Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python | |
| import datetime | |
| import operator | |
| import pandas as pd | |
| import tqdm.auto | |
| from apscheduler.schedulers.background import BackgroundScheduler | |
| from huggingface_hub import HfApi | |
| from ragatouille import RAGPretrainedModel | |
| import gradio as gr | |
| from gradio_calendar import Calendar | |
| import datasets | |
| import requests | |
# --- Data Loading and Processing ---
# Hub client reused by every snapshot download in this module.
api = HfApi()

# Dataset repo that stores the prebuilt abstract index, and the local
# directory it is downloaded to and loaded from.
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"

# The index files must be on disk before the retriever can be built.
api.snapshot_download(repo_id=INDEX_REPO_ID, repo_type="dataset", local_dir=INDEX_DIR_PATH)

abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
# Throwaway query: run one search at startup so the retriever is initialized
# before the first real request arrives.
abstract_retriever.search("LLM")
def update_abstract_index() -> None:
    """Re-download the abstract index and swap in a freshly loaded retriever.

    Intended to run as a background scheduler job. The new retriever is built
    and warmed up in a local variable first and only then published to the
    module-level ``abstract_retriever``, so searches running concurrently never
    see a retriever that has not served its first query yet, and a failure
    while loading or warming up leaves the previous retriever in place.
    """
    global abstract_retriever

    api.snapshot_download(
        repo_id=INDEX_REPO_ID,
        repo_type="dataset",
        local_dir=INDEX_DIR_PATH,
    )
    fresh_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
    # Same warm-up query the module runs at import time.
    fresh_retriever.search("LLM")
    # Publish last: rebinding the global is the only step other threads observe.
    abstract_retriever = fresh_retriever
# Background job: reload the abstract index at minute 0 of every hour (UTC).
scheduler_abstract = BackgroundScheduler()
scheduler_abstract.add_job(
    update_abstract_index,
    "cron",
    minute=0,
    timezone="UTC",
    misfire_grace_time=3 * 60,
)
scheduler_abstract.start()
def get_df() -> pd.DataFrame:
    """Load the daily-papers table joined with its stats table.

    Both datasets are read from the Hub (``train`` split) and merged on
    ``arxiv_id``. The merged rows are then reversed (presumably so the most
    recently added papers come first -- confirm against the dataset ordering),
    ``date`` is normalized to ``YYYY-MM-DD`` strings, the ``abstract`` column
    is dropped and a ``paper_page`` URL column is appended.

    Returns:
        One row per paper, without ``abstract`` and with ``paper_page`` as the
        last column.

    Raises:
        KeyError: if the merged table has no ``date``, ``abstract`` or
            ``arxiv_id`` column.
    """
    papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
    stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
    df = pd.merge(left=papers, right=stats, on="arxiv_id")

    df = df[::-1].reset_index(drop=True)
    df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")

    # Column-wise operations instead of copying every row through iterrows():
    # same columns in the same order, but dtypes are kept and an empty table
    # still comes back with its columns.
    df = df.drop(columns=["abstract"])
    df["paper_page"] = "https://huggingface.co/papers/" + df["arxiv_id"].astype(str)
    return df
| class Prettifier: | |
| def get_github_link(link: str) -> str: | |
| if not link: | |
| return "" | |
| return Prettifier.create_link("github", link) | |
| def create_link(text: str, url: str) -> str: | |
| return f'<a href="{url}" target="_blank">{text}</a>' | |
| def to_div(text: str | None, category_name: str) -> str: | |
| if text is None: | |
| text = "" | |
| class_name = f"{category_name}-{text.lower()}" | |
| return f'<div class="{class_name}">{text}</div>' | |
| def __call__(self, df: pd.DataFrame) -> pd.DataFrame: | |
| new_rows = [] | |
| for _, row in df.iterrows(): | |
| new_row = { | |
| "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"), | |
| "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page), | |
| "title": row["title"], | |
| "github": self.get_github_link(row.github), | |
| "๐": row["upvotes"], | |
| "๐ฌ": row["num_comments"], | |
| } | |
| new_rows.append(new_row) | |
| return pd.DataFrame(new_rows) | |
class PaperList:
    """Raw paper table plus its display version, with date/title/abstract search."""

    # [column name, Gradio datatype] pairs, in display order.
    # NOTE(review): the last two names look like mis-decoded emoji; they must
    # match the column names produced by Prettifier, so they are kept verbatim.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["๐", "number"],
        ["๐ฌ", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Keep *df* as the raw table and precompute its display version."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Display column names, in ``COLUMN_INFO`` order."""
        return [name for name, _ in self.COLUMN_INFO]

    @property
    def column_datatype(self) -> list[str]:
        """Gradio datatypes of the display columns, in ``COLUMN_INFO`` order."""
        return [datatype for _, datatype in self.COLUMN_INFO]

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
        abstract_search_query: str,
        max_num_to_retrieve: int,
    ) -> pd.DataFrame:
        """Filter the raw table and return the matching rows, prettified.

        Args:
            start_date: inclusive lower bound on the paper date.
            end_date: inclusive upper bound on the paper date.
            title_search_query: case-insensitive pattern matched against the
                title; skipped when empty.
            abstract_search_query: query sent to the module-level
                ``abstract_retriever``; skipped when empty.
            max_num_to_retrieve: ``k`` passed to the retriever.

        Returns:
            The display columns of the matching rows. With an abstract query
            the rows follow the order the retriever returned them in;
            otherwise they keep the order of ``df_raw``.
        """
        df = self.df_raw.copy()
        df["date"] = pd.to_datetime(df["date"])

        # Filter by date. Copy the slice so the assignment below writes to an
        # independent frame rather than to a view of the unfiltered one.
        in_range = (df["date"] >= start_date) & (df["date"] <= end_date)
        df = df[in_range].copy()
        df["date"] = df["date"].dt.strftime("%Y-%m-%d")

        # Filter by title. na=False makes rows with a missing title simply not
        # match instead of breaking the boolean mask.
        # NOTE(review): the query is still interpreted as a regular expression
        # (pandas default), so input such as "c++" raises -- confirm whether
        # literal matching (regex=False) is wanted.
        if title_search_query:
            df = df[df["title"].str.contains(title_search_query, case=False, na=False)]

        # Filter by abstract: keep only retrieved papers that survived the
        # filters above, de-duplicated, in retrieval order.
        if abstract_search_query:
            results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
            remaining_ids = set(df["arxiv_id"])
            hits = [hit["document_id"] for hit in results if hit["document_id"] in remaining_ids]
            found_ids = list(dict.fromkeys(hits))
            df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()

        return self._prettifier(df).loc[:, self.column_names]
# Build the module-level paper list once at import time from freshly loaded data.
paper_list = PaperList(df=get_df())
def update_paper_list() -> None:
    """Reload the paper data and rebind the module-level ``paper_list`` to it.

    Only the global name is rebound; any object that stored a reference to the
    previous ``PaperList`` keeps using that one.
    """
    global paper_list
    latest_df = get_df()
    paper_list = PaperList(latest_df)
# Background job: rebuild the paper list at minute 0 of every hour (UTC).
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    update_paper_list,
    "cron",
    minute=0,
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
# --- Gradio App ---
# Markdown heading linking to the Daily Papers page.
DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"

# Markdown footer: one heading line followed by one bullet per related Space.
_FOOT_NOTE_LINES = (
    "Related useful Spaces:",
    "- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)",
    "- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)",
    "- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)",
)
FOOT_NOTE = "\n".join(_FOOT_NOTE_LINES) + "\n"
# --- Sorting and Pagination Management ---
class PaperManager:
    """Sort and paginate the papers held by a ``PaperList``.

    Sorting rewrites ``paper_list.df_raw`` and ``paper_list.df_prettified`` in
    place; paging slices ``paper_list.df_prettified``.
    """

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        """Start on page 1 with the "hot" sort method selected (not yet applied)."""
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.current_page = 1
        self.total_pages = self._count_pages()
        self.sort_method = "hot"  # Default sort method

    def _count_pages(self) -> int:
        """Number of pages needed for ``df_raw`` (ceiling division, at least 1)."""
        num_papers = len(self.paper_list.df_raw)
        return max((num_papers + self.papers_per_page - 1) // self.papers_per_page, 1)

    def calculate_score(self, paper):
        """Return the "hotness" of *paper*: ``upvotes / (age_hours + 2) ** 1.5``.

        Args:
            paper: mapping-like row (a dict or a pandas Series) read through
                ``.get`` for ``upvotes`` (default 0) and ``date``.

        The date is parsed as ISO 8601; a value without a UTC offset is taken
        to be UTC. A missing or unparseable date is treated as "now", and a
        date in the future is clamped to an age of zero so the power below is
        never taken of a negative number.
        """
        now = datetime.datetime.now(datetime.timezone.utc)
        upvotes = paper.get("upvotes", 0)
        published_at = paper.get("date")
        try:
            # fromisoformat() on older Pythons rejects a trailing "Z".
            published_time = datetime.datetime.fromisoformat(str(published_at).replace("Z", "+00:00"))
        except ValueError:
            # If parsing fails, use current time to minimize the impact on sorting
            published_time = now
        if published_time.tzinfo is None:
            # Naive and aware datetimes cannot be subtracted from each other.
            published_time = published_time.replace(tzinfo=datetime.timezone.utc)

        age_hours = max((now - published_time).total_seconds() / 3600, 0.0)
        # The +2 keeps the denominator away from zero for brand-new papers.
        return upvotes / ((age_hours + 2) ** 1.5)

    def sort_papers(self):
        """Re-sort the paper list by ``sort_method`` and jump back to page 1.

        "hot" orders by ``calculate_score`` (highest first), "new" by ``date``
        (latest first); any other value keeps the current order. Ties keep
        their previous relative order.
        """
        df = self.paper_list.df_raw.copy()
        if df.empty:
            # Nothing to order; also avoids DataFrame.apply returning a frame
            # instead of a Series for zero rows.
            df_sorted = df
        elif self.sort_method == "hot":
            df["score"] = df.apply(self.calculate_score, axis=1)
            df_sorted = df.sort_values(by="score", ascending=False, kind="stable").drop(columns=["score"])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by="date", ascending=False, kind="stable")
        else:
            df_sorted = df

        plist = self.paper_list
        plist.df_raw = df_sorted.reset_index(drop=True)
        plist.df_prettified = plist._prettifier(plist.df_raw).loc[:, plist.column_names]
        self.total_pages = self._count_pages()
        self.current_page = 1

    def set_sort_method(self, method):
        """Select *method* ("hot" or "new"; anything else becomes "hot") and re-sort.

        Returns:
            Always ``True``.
        """
        if method not in ["hot", "new"]:
            method = "hot"
        print(f"Setting sort method to: {method}")
        self.sort_method = method
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self):
        """Return the prettified rows belonging to ``current_page``."""
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        return self.paper_list.df_prettified.iloc[start:end]

    def next_page(self):
        """Advance one page unless already on the last one; return the page shown."""
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.get_current_page_papers()

    def prev_page(self):
        """Go back one page unless already on the first one; return the page shown."""
        if self.current_page > 1:
            self.current_page -= 1
        return self.get_current_page_papers()

    def refresh(self):
        """Re-apply the current sort and return the first page."""
        self.sort_papers()
        return self.get_current_page_papers()
# Initialize PaperManager
paper_manager = PaperManager(paper_list)


def refresh_paper_manager():
    """Point the existing manager at the current ``paper_list`` and show page 1.

    The manager is updated in place rather than replaced: UI callbacks that
    were bound to ``paper_manager``'s methods when the interface was built
    keep operating on this same instance, so paging continues to work after a
    refresh. The sort method currently selected on the manager is kept and
    re-applied to the refreshed data.

    Returns:
        The first page of the re-sorted, prettified papers.
    """
    # Read the global at call time so a list rebuilt by the background job is
    # picked up here.
    paper_manager.paper_list = paper_list
    # sort_papers() re-prettifies, recomputes the page count and resets to page 1.
    paper_manager.sort_papers()
    return paper_manager.get_current_page_papers()
# --- Gradio Interface Functions ---
def update_num_papers(current_df: pd.DataFrame) -> str:
    """Format the counter text: rows in *current_df* over rows in the manager's raw table."""
    shown = len(current_df)
    total = len(paper_manager.paper_list.df_raw)
    return f"{shown} / {total}"
def perform_search(
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    search_title: str,
    search_abstract: str,
    max_num_to_retrieve: int,
    sort_method: str
) -> pd.DataFrame:
    """Run a search and make its results the table that is paged through.

    Args:
        start_date: inclusive lower bound on the paper date.
        end_date: inclusive upper bound on the paper date.
        search_title: title query; empty means no title filter.
        search_abstract: abstract query; empty means no abstract filter.
        max_num_to_retrieve: result cap used for the abstract search.
        sort_method: "Hot" or "New" (case-insensitive).

    Returns:
        The first page of the search results.
    """
    manager = paper_manager

    # Sort first: this reorders the raw table, and search() filters that table
    # in order, so date/title results come back in the requested order.
    # (An abstract query returns rows in the retriever's order instead.)
    manager.set_sort_method(sort_method.lower())

    results = manager.paper_list.search(start_date, end_date, search_title, search_abstract, max_num_to_retrieve)

    # search() already returns display-ready rows. They only replace the paged
    # view: writing them into df_raw or prettifying/sorting them again would
    # fail, because they no longer carry the raw columns (arxiv_id, upvotes, ...)
    # and would shrink the table that later searches run against.
    manager.paper_list.df_prettified = results.reset_index(drop=True)
    manager.total_pages = max((len(results) + manager.papers_per_page - 1) // manager.papers_per_page, 1)
    manager.current_page = 1
    return manager.get_current_page_papers()
def change_sort_method(method: str) -> pd.DataFrame:
    """Apply the lower-cased sort *method* to the manager and return the page it now shows."""
    normalized = method.lower()
    paper_manager.set_sort_method(normalized)
    first_page = paper_manager.get_current_page_papers()
    return first_page
def get_initial_papers() -> pd.DataFrame:
    """Return the manager's current page; used to fill the table when the app loads."""
    current_page_df = paper_manager.get_current_page_papers()
    return current_page_df
# --- CSS Styling ---
# Stylesheet handed to gr.Blocks below: page/link colors, the orange header
# bar, the "#refresh-button" look, plus a small-screen and a dark-mode variant.
css = """
/* Existing CSS remains unchanged */
body {
background-color: white;
font-family: Verdana, Geneva, sans-serif;
margin: 0;
padding: 0;
}
a {
color: #0000ff;
text-decoration: none;
}
a:visited {
color: #551A8B;
}
.container {
width: 85%;
margin: auto;
}
table {
width: 100%;
}
.header-table {
width: 100%;
background-color: #ff6600;
padding: 2px 10px;
}
.header-table a {
color: black;
font-weight: bold;
font-size: 14pt;
text-decoration: none;
}
.itemlist .athing {
background-color: #f6f6ef;
}
.rank {
font-size: 14pt;
color: #828282;
padding-right: 5px;
}
.storylink {
font-size: 10pt;
}
.subtext {
font-size: 8pt;
color: #828282;
padding-left: 40px;
}
.subtext a {
color: #828282;
text-decoration: none;
}
#refresh-button {
background: none;
border: none;
color: black;
font-weight: bold;
font-size: 14pt;
cursor: pointer;
}
.no-papers {
text-align: center;
color: #828282;
padding: 1rem;
font-size: 14pt;
}
@media (max-width: 640px) {
.header-table a {
font-size: 12pt;
}
.storylink {
font-size: 9pt;
}
.subtext {
font-size: 7pt;
}
}
/* Dark mode */
@media (prefers-color-scheme: dark) {
body {
background-color: #121212;
color: #e0e0e0;
}
a {
color: #add8e6;
}
a:visited {
color: #9370db;
}
.header-table {
background-color: #ff6600;
}
.header-table a {
color: black;
}
.itemlist .athing {
background-color: #1e1e1e;
}
.rank {
color: #b0b0b0;
}
.subtext {
color: #b0b0b0;
}
.subtext a {
color: #b0b0b0;
}
#refresh-button {
color: #e0e0e0;
}
.no-papers {
color: #b0b0b0;
}
}
"""


def show_prev_page() -> pd.DataFrame:
    """Page back on whichever manager ``paper_manager`` names at call time."""
    return paper_manager.prev_page()


def show_next_page() -> pd.DataFrame:
    """Page forward on whichever manager ``paper_manager`` names at call time."""
    return paper_manager.next_page()


# --- Initialize Gradio Blocks ---
# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting of the layout containers below is the most plausible reading --
# confirm against the rendered page.
demo = gr.Blocks(css=css)
with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
**Submit the paper to Daily Papers:**
[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)
Once your paper is submitted, it will automatically appear in this demo.
""")

        # Header with Refresh Button.
        # The button is a real Gradio component carrying the "refresh-button"
        # id that the stylesheet targets. A raw <button> plus a <script> inside
        # gr.HTML cannot trigger anything, because scripts injected through
        # gr.HTML are not executed by the browser.
        with gr.Row():
            gr.HTML("""
<table border="0" cellpadding="0" cellspacing="0" class="header-table">
<tr>
<td>
<span class="pagetop">
<b class="hnname"><a href="#">Daily Papers</a></b>
</span>
</td>
</tr>
</table>
""")
            refresh_button = gr.Button("Refresh", elem_id="refresh-button", scale=0)

        # Sorting Options
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New"],
                value="Hot",
                label="Sort By",
                interactive=True,
            )

        # Search and Filter Inputs
        with gr.Group():
            search_title = gr.Textbox(label="Search Title")
            with gr.Row():
                with gr.Column(scale=4):
                    search_abstract = gr.Textbox(
                        label="Search Abstract",
                        info="The result may not be accurate as the abstract does not contain all the information.",
                    )
                with gr.Column(scale=1):
                    max_num_to_retrieve = gr.Slider(
                        label="Max Number to Retrieve",
                        info="This is used only for search on abstracts.",
                        minimum=1,
                        maximum=1000,  # Adjust as needed
                        step=1,
                        value=100,
                    )
            with gr.Row():
                start_date = Calendar(label="Start Date", type="date", value="2023-05-05")
                # Timezone-aware replacement for the deprecated datetime.utcnow().
                end_date = Calendar(
                    label="End Date",
                    type="date",
                    value=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
                )
        search_button = gr.Button("Search")

        # Number of Papers Display
        num_papers = gr.Textbox(
            label="Number of Papers",
            value=update_num_papers(paper_manager.get_current_page_papers()),
            interactive=False,
        )

        # Paper List Display
        df_display = gr.DataFrame(
            value=paper_manager.get_current_page_papers(),
            datatype=paper_manager.paper_list.column_datatype,
            type="pandas",
            interactive=False,
            height=600,
            elem_id="table",
            column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
            wrap=True,
        )

        # Pagination Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

        # Footer
        gr.Markdown(FOOT_NOTE)

        # Event Handlers
        # Every action below replaces the table contents; the paging callbacks
        # go through module-level wrappers so they are not tied to the manager
        # instance that existed when the interface was built.
        table_events = [
            refresh_button.click(fn=refresh_paper_manager, outputs=[df_display]),
            search_button.click(
                fn=perform_search,
                inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve, sort_radio],
                outputs=[df_display],
            ),
            sort_radio.change(fn=change_sort_method, inputs=[sort_radio], outputs=[df_display]),
            prev_button.click(fn=show_prev_page, inputs=None, outputs=[df_display]),
            next_button.click(fn=show_next_page, inputs=None, outputs=[df_display]),
            # Initial Load
            demo.load(fn=get_initial_papers, outputs=[df_display]),
        ]
        # After any of them, refresh the "shown / total" counter from the table.
        for table_event in table_events:
            table_event.then(
                fn=update_num_papers,
                inputs=df_display,
                outputs=num_papers,
                queue=False,
            )

# --- Launch the App ---
if __name__ == "__main__":
    demo.launch()