Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python | |
import atexit  # To gracefully shut down the scheduler
import datetime
import html  # For escaping paper titles embedded in generated HTML
import logging  # For logging purposes
from datetime import timezone

import datasets  # Ensure the datasets library is imported
import gradio as gr
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
# --- Logging Configuration ---
# Configure logging once at import time with the threshold set to INFO.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)
# --- Data Loading and Processing ---
# Hugging Face Hub client.
# NOTE(review): `api` is not referenced anywhere else in the visible code — confirm it is still needed.
api = HfApi()
def get_df() -> pd.DataFrame:
    """
    Load, merge and preprocess the papers and stats datasets.

    The two datasets are joined on 'arxiv_id', reversed so the last rows come
    first, and the 'date' column is normalised to ``YYYY-MM-DD`` strings
    (unparseable or missing dates fall back to the current UTC date). The
    'abstract' column is dropped if present and a 'paper_page' URL column is
    added for each paper.

    Returns:
        The prepared DataFrame, or an empty DataFrame if anything fails
        (the failure is logged with its traceback rather than raised).
    """
    today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
    try:
        # Load datasets
        logger.info("Loading 'daily-papers' dataset.")
        df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
        logger.info("Loading 'daily-papers-stats' dataset.")
        df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

        # Merge datasets on 'arxiv_id'
        logger.info("Merging datasets on 'arxiv_id'.")
        df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=('_papers', '_stats'))

        # Reverse the DataFrame to have the latest papers first
        df = df[::-1].reset_index(drop=True)

        # Normalise 'date'. The existence check has to happen *before* the
        # column is read, otherwise a missing column raises KeyError and the
        # fallback below can never run.
        logger.info("Processing 'date' column.")
        if 'date' in df.columns:
            parsed = pd.to_datetime(df["date"], errors='coerce')
            df["date"] = parsed.dt.strftime("%Y-%m-%d").fillna(today)
        else:
            logger.error("'date' column is missing from the DataFrame. Filling with current date.")
            df["date"] = today

        # Prepare the DataFrame by removing 'abstract'
        logger.info("Removing 'abstract' column if present.")
        df = df.drop(columns=['abstract'], errors='ignore')

        # Add 'paper_page' links
        logger.info("Adding 'paper_page' links.")
        df["paper_page"] = df["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")

        logger.info("DataFrame preparation complete.")
        return df
    except Exception as e:
        # Deliberate best-effort boundary: callers treat an empty frame as
        # "no data available", so log the full traceback and carry on.
        logger.exception(f"Error in get_df: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error
class Prettifier:
    """
    Converts raw DataFrame rows into a prettified format suitable for display.

    Calling an instance with a raw DataFrame returns a new DataFrame whose
    columns are exactly ``REQUIRED_COLUMNS``; link-like fields are rendered as
    HTML anchors.
    """

    # Output columns, in order. The last two keys hold the upvote and comment
    # counts (read from the raw 'upvotes' / 'num_comments' columns).
    REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "π", "π¬"]

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return an anchor labelled 'github' for *link*, or "" when there is no link."""
        # Missing values can arrive from pandas as None/NaN rather than "";
        # NaN is truthy, so a plain `not link` test would render a link to "nan".
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor that opens *url* in a new tab and displays *text*."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the prettified DataFrame from *df*.

        Missing fields fall back to defaults ("" for ids/links, "No title",
        0 for counts, the current UTC date for 'date').
        """
        new_rows = []
        for _, row in df.iterrows():
            # Handle date_display as a clickable link
            date_display = Prettifier.create_link(
                row.get("date", ""),
                f"https://huggingface.co/papers?date={row.get('date', '')}",
            )
            new_rows.append(
                {
                    "arxiv_id": row.get("arxiv_id", ""),  # Include arxiv_id
                    "date_display": date_display,  # For display
                    # For internal calculations
                    "date": row.get("date", datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")),
                    "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
                    "title": row.get("title", "No title"),
                    "github": Prettifier.get_github_link(row.get("github", "")),
                    "π": row.get("upvotes", 0),
                    "π¬": row.get("num_comments", 0),
                }
            )
        # Passing the column list explicitly keeps the schema identical for the
        # empty case, so callers selecting these columns never hit a KeyError.
        return pd.DataFrame(new_rows, columns=self.REQUIRED_COLUMNS)
class PaperList:
    """
    Manages the list of papers.

    Holds the raw DataFrame (``df_raw``) together with its prettified
    counterpart (``df_prettified``), restricted to the columns declared in
    ``COLUMN_INFO``.
    """

    # [column name, display datatype] pairs, in display order.
    COLUMN_INFO = [
        ["arxiv_id", "str"],  # Added arxiv_id
        ["date_display", "markdown"],  # For display
        ["date", "str"],  # For internal use
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["π", "number"],
        ["π¬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    # These two must be properties: callers read them as attributes
    # (e.g. `.loc[:, paper_list.column_names]`). As plain methods, pandas would
    # receive a bound method, treat it as a callable indexer and invoke it with
    # the DataFrame as an argument, raising TypeError.
    @property
    def column_names(self) -> list:
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return [col[0] for col in self.COLUMN_INFO]

    @property
    def column_datatype(self) -> list:
        """Display datatypes of the columns, in ``COLUMN_INFO`` order."""
        return [col[1] for col in self.COLUMN_INFO]

    def get_prettified_df(self) -> pd.DataFrame:
        """
        Returns the prettified DataFrame.
        """
        return self.df_prettified
| # --- Sorting and Pagination Management --- | |
class PaperManager:
    """
    Manages sorting and pagination for the list of papers.

    Sorting rewrites ``paper_list.df_raw`` in the chosen order and rebuilds
    ``paper_list.df_prettified`` from it; pagination renders one slice of the
    prettified rows as an HTML table.
    """

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"  # Default sort method
        # sort_papers() also sets 'current_page' and 'total_pages'.
        self.sort_papers()

    @staticmethod
    def _today() -> str:
        """Current UTC date as a ``YYYY-MM-DD`` string."""
        return datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")

    @staticmethod
    def _parse_date(date_str) -> datetime.datetime:
        """Parse a ``YYYY-MM-DD`` string as UTC midnight; fall back to now if it cannot be parsed."""
        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # TypeError covers non-string values such as None/NaN. Using the
            # current time minimises the impact of a bad date on sorting.
            return datetime.datetime.now(timezone.utc)

    def calculate_score(self, row):
        """
        Calculate the score of a paper based on upvotes and age.
        This mimics the "hotness" algorithm used by platforms like Hacker News.

        Args:
            row: Mapping-like object exposing ``get`` with 'upvotes' and 'date'.

        Returns:
            ``upvotes / (age_in_hours + 2) ** 1.5``, or 0 when the base is not
            positive (dates more than two hours in the future).
        """
        upvotes = row.get('upvotes', 0)
        published_time = self._parse_date(row.get('date', self._today()))
        time_diff = datetime.datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours
        base = time_diff_hours + 2
        # A non-positive base would divide by zero or yield a complex number.
        return upvotes / (base ** 1.5) if base > 0 else 0

    def sort_papers(self):
        """
        Sorts the papers based on the current sort method.

        Also regenerates the prettified DataFrame, recomputes 'total_pages'
        and resets 'current_page' to 1.
        """
        df = self.paper_list.df_raw.copy()
        if df.empty:
            # Nothing to order. This also avoids a KeyError when sorting a
            # column-less empty frame by 'date'.
            df_sorted = df
        elif self.sort_method == "hot":
            df = df.drop(columns=['score'], errors='ignore')  # Remove existing 'score' column if present
            df['score'] = df.apply(self.calculate_score, axis=1)
            df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date'
        else:
            df_sorted = df

        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[
            :, self.paper_list.column_names
        ]
        # Ceiling division, with at least one (possibly empty) page.
        self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
        self.current_page = 1
        logger.info(f"Papers sorted by {self.sort_method}. Total pages: {self.total_pages}")

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new') and re-sorts the papers.

        Unknown methods fall back to 'hot'. 'time_frame' is accepted but not used.
        Always returns True.
        """
        if method not in ["hot", "new"]:
            method = "hot"
        logger.info(f"Setting sort method to: {method}")
        self.sort_method = method
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers.
        """
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.paper_list.df_prettified.iloc[start:end]
        if current_papers.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"
        # Rank by position within the slice. The DataFrame index labels are
        # already absolute, so adding 'start' to them would double-count the
        # offset on every page after the first.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
        )
        return f"""
        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
        {papers_html}
        </table>
        """

    def format_paper(self, row, rank):
        """
        Formats a single paper entry into HTML.

        Args:
            row: Mapping-like prettified row exposing ``get``.
            rank: 1-based position of the paper in the full sorted list.
        """
        # Titles are free text; escape them so '<' or '&' cannot break the markup.
        title = html.escape(str(row.get('title', 'No title')))
        paper_id = row.get('arxiv_id', '')
        url = html.escape(f"https://huggingface.co/papers/{paper_id}")
        upvotes = row.get('π', 0)
        comments = row.get('π¬', 0)
        published_time = self._parse_date(row.get('date', self._today()))
        time_ago_days = (datetime.datetime.now(timezone.utc) - published_time).days
        if time_ago_days > 1:
            time_ago = f"{time_ago_days} days ago"
        elif time_ago_days == 1:
            time_ago = "1 day ago"
        else:
            time_ago = "today"
        return f"""
        <tr class="athing">
        <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
        <td valign="top" class="title">
        <a href="{url}" class="storylink" target="_blank">{title}</a>
        </td>
        </tr>
        <tr>
        <td colspan="1"></td>
        <td class="subtext">
        <span class="score">{upvotes} upvotes</span><br>
        {time_ago} | <a href="#">{comments} comments</a>
        </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def next_page(self) -> str:
        """
        Navigates to the next page if possible.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
            logger.info(f"Navigated to page {self.current_page}.")
        else:
            logger.info("Already on the last page.")
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible.
        """
        if self.current_page > 1:
            self.current_page -= 1
            logger.info(f"Navigated to page {self.current_page}.")
        else:
            logger.info("Already on the first page.")
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Refreshes the current list of papers.

        Re-sorting resets the view to the first page.
        """
        logger.info("Refreshing papers.")
        self.sort_papers()
        return self.get_current_page_papers()
| # Initialize PaperList and PaperManager | |
def initialize_paper_manager() -> str:
    """
    Build a fresh PaperList/PaperManager pair from the current data.

    Returns the HTML for the first page of the new manager (not the manager
    object itself). An empty DataFrame only triggers a warning; the manager is
    still built.
    """
    frame = get_df()
    if frame.empty:
        logger.warning("Initialized with an empty DataFrame.")
    manager = PaperManager(PaperList(frame))
    logger.info("PaperManager initialized.")
    first_page_html = manager.get_current_page_papers()
    return first_page_html
# Module-wide PaperManager instance; populated by setup_paper_manager().
paper_manager = None


def setup_paper_manager():
    """
    Create the global PaperManager from freshly loaded data.
    """
    global paper_manager
    paper_manager = PaperManager(PaperList(get_df()))
    logger.info("PaperManager setup complete.")


# Build the manager once at import time so the UI has data on first load.
setup_paper_manager()
def update_paper_manager() -> str:
    """
    Reload the data and swap it into the global PaperManager.

    When the reload yields an empty DataFrame the existing data is kept.
    Either way the HTML of the manager's current page is returned.
    """
    global paper_manager
    logger.info("Updating PaperManager with latest data.")
    latest = get_df()
    if not latest.empty:
        paper_manager.paper_list = PaperList(latest)
        paper_manager.sort_papers()
        logger.info("PaperManager updated successfully.")
    else:
        logger.warning("DataFrame is empty. Skipping update.")
    return paper_manager.get_current_page_papers()
| # Scheduler for updating paper list every hour | |
scheduler_data = BackgroundScheduler()
# Cron trigger firing at minute 0 of every hour (UTC).
scheduler_data.add_job(
    update_paper_manager,
    "cron",
    minute=0,
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
logger.info("BackgroundScheduler started.")
# Stop the scheduler when the interpreter exits.
atexit.register(scheduler_data.shutdown)
logger.info("Scheduler shutdown registered.")
| # --- Gradio Interface Functions --- | |
def change_sort_method_ui(method: str) -> str:
    """
    Apply the sort method chosen in the UI and return the re-rendered page.

    The method name is lower-cased before being handed to the global
    PaperManager.
    """
    logger.info(f"Changing sort method to: {method}")
    normalized = method.lower()
    paper_manager.set_sort_method(normalized)
    rendered = paper_manager.get_current_page_papers()
    return rendered
# --- CSS Styling ---
# Stylesheet passed to gr.Blocks(css=...). It styles the classes emitted by
# PaperManager (.itemlist, .athing, .rank, .storylink, .subtext, .no-papers)
# and by the header markup (.header-table, .sort-links), with overrides for
# narrow screens (max-width: 640px) and for prefers-color-scheme: dark.
css = """
/* Hacker News-like CSS */
body {
background-color: white;
font-family: Verdana, Geneva, sans-serif;
margin: 0;
padding: 0;
}
a {
color: #0000ff;
text-decoration: none;
}
a:visited {
color: #551A8B;
}
.container {
width: 85%;
margin: auto;
}
table {
width: 100%;
}
.header-table {
width: 100%;
background-color: #ff6600;
padding: 2px 10px;
}
.header-table a {
color: black;
font-weight: bold;
font-size: 14pt;
text-decoration: none;
}
.header-table .sort-links a {
color: black;
font-weight: normal;
font-size: 14pt;
margin-left: 15px;
cursor: pointer;
}
.itemlist .athing {
background-color: #f6f6ef;
}
.rank {
font-size: 14pt;
color: #828282;
padding-right: 5px;
}
.storylink {
font-size: 10pt;
}
.subtext {
font-size: 8pt;
color: #828282;
padding-left: 40px;
}
.subtext a {
color: #828282;
text-decoration: none;
}
.no-papers {
text-align: center;
color: #828282;
padding: 1rem;
font-size: 14pt;
}
@media (max-width: 640px) {
.header-table a {
font-size: 12pt;
}
.sort-links a {
font-size: 12pt;
margin-left: 10px;
}
.storylink {
font-size: 9pt;
}
.subtext {
font-size: 7pt;
}
}
/* Dark mode */
@media (prefers-color-scheme: dark) {
body {
background-color: #121212;
color: #e0e0e0;
}
a {
color: #add8e6;
}
a:visited {
color: #9370db;
}
.header-table {
background-color: #ff6600;
}
.header-table a {
color: black;
}
.header-table .sort-links a {
color: black;
}
.itemlist .athing {
background-color: #1e1e1e;
}
.rank {
color: #b0b0b0;
}
.subtext {
color: #b0b0b0;
}
.subtext a {
color: #b0b0b0;
}
.no-papers {
color: #b0b0b0;
}
}
"""
| # --- Initialize Gradio Blocks --- | |
# Top-level Gradio app: header, paper list and Prev/Next navigation, all
# backed by the module-level `paper_manager`.
demo = gr.Blocks(css=css)
with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
**Submit the paper to Daily Papers:**
[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)
Once your paper is submitted, it will automatically appear in this demo.
""")
        # Hacker News-like Header with "Hot" and "New" sort options
        # NOTE(review): the inline script calls `gradioApp()`, which is not
        # defined anywhere in this file, and its click targets are created
        # with visible=False below — verify that the Hot/New links actually
        # reach the hidden buttons in the deployed Gradio version.
        with gr.Row():
            gr.HTML("""
<table border="0" cellpadding="0" cellspacing="0" class="header-table">
<tr>
<td>
<span class="pagetop">
<b class="hnname"><a href="#">Daily Papers</a></b>
</span>
</td>
<td align="right" class="sort-links">
<a href="#" onclick="HotSort()">Hot</a> |
<a href="#" onclick="NewSort()">New</a>
</td>
</tr>
</table>
<script>
function HotSort() {
gradioApp().getElementById('hot_sort_button').click();
}
function NewSort() {
gradioApp().getElementById('new_sort_button').click();
}
</script>
""")
        # Hidden buttons to trigger sort methods; their elem_id values are the
        # ids looked up by the header script above.
        hot_sort = gr.Button("Hot Sort", visible=False, elem_id="hot_sort_button")
        new_sort = gr.Button("New Sort", visible=False, elem_id="new_sort_button")
        # Paper list: receives the HTML produced by PaperManager.
        paper_list = gr.HTML()
        # Navigation Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")
    # Load papers on app start. The lambda looks up the global
    # `paper_manager` at call time rather than when the UI is built.
    demo.load(
        fn=lambda: paper_manager.get_current_page_papers(),
        outputs=[paper_list]
    )
    # Button clicks for pagination. These bind the methods of the manager
    # object that exists when the UI is built.
    prev_button.click(paper_manager.prev_page, outputs=[paper_list])
    next_button.click(paper_manager.next_page, outputs=[paper_list])
    # Hidden buttons trigger sort methods
    hot_sort.click(
        fn=lambda: change_sort_method_ui("hot"),
        inputs=[],
        outputs=[paper_list]
    )
    new_sort.click(
        fn=lambda: change_sort_method_ui("new"),
        inputs=[],
        outputs=[paper_list]
    )
    # The footer markdown section was intentionally removed.
# --- Launch the App ---
# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()