dailypapershackernews-dev

Runtime error

App Files Files Community

akhaliq HF Staff committed on Sep 20, 2024

Commit

22d5f09

verified ·

1 Parent(s): 33ea647

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -8

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from huggingface_hub import HfApi
 import gradio as gr
 import datasets  # Ensure the datasets library is imported
 from datetime import timezone
 import atexit  # To gracefully shut down the scheduler
@@ -21,10 +22,37 @@ logger = logging.getLogger(__name__)
 api = HfApi()
 def get_df() -> pd.DataFrame:
     """
     Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
-    and adds a 'paper_page' link for each paper.
     """
     try:
         # Load datasets
@@ -52,6 +80,17 @@ def get_df() -> pd.DataFrame:
             info = row.copy()
             if "abstract" in info:
                 del info["abstract"]
             paper_info.append(info)
         df_prepared = pd.DataFrame(paper_info)
@@ -70,7 +109,11 @@ class Prettifier:
     """
     Converts raw DataFrame rows into a prettified format suitable for display.
     """
-    REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "👍", "💬"]
     @staticmethod
     def get_github_link(link: str) -> str:
@@ -97,6 +140,9 @@ class Prettifier:
                 "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row.get("upvotes", 0),
                 "💬": row.get("num_comments", 0),
             }
             new_rows.append(new_row)
@@ -120,6 +166,9 @@ class PaperList:
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
     ]
     def __init__(self, df: pd.DataFrame):
@@ -212,6 +261,12 @@ class PaperManager:
             df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc, ambiguous='NaT', nonexistent='NaT')
             df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
             df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
         else:
             df_sorted = df
@@ -219,13 +274,15 @@ class PaperManager:
         self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[:, self.paper_list.column_names]
         self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
         self.current_page = 1
     def set_sort_method(self, method, time_frame=None):
         """
-        Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
         If 'top' is selected, also sets the time frame.
         """
-        if method not in ["hot", "new", "top"]:
             method = "hot"
         logger.info(f"Setting sort method to: {method}")
         self.sort_method = method
@@ -262,6 +319,9 @@ class PaperManager:
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
         date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
             published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
@@ -282,7 +342,8 @@ class PaperManager:
             <td colspan="1"></td>
             <td class="subtext">
                 <span class="score">{upvotes} upvotes</span><br>
-                {time_ago} | <a href="#">{comments} comments</a>
             </td>
         </tr>
         <tr style="height:5px"></tr>
@@ -325,14 +386,16 @@ def initialize_paper_manager() -> str:
     Initializes the PaperList and PaperManager with the current DataFrame.
     """
     df = get_df()
     paper_list = PaperList(df)
     manager = PaperManager(paper_list)
     return manager.get_current_page_papers()  # Return HTML string instead of the manager object
 paper_manager = None  # Initialize globally
 def setup_paper_manager():
     """
     Sets up the global PaperManager instance.
@@ -388,7 +451,9 @@ def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
     Changes the sort method and, if 'top' is selected, sets the time frame.
     """
     logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
-    if method.lower() == "top":
         paper_manager.set_sort_method(method.lower(), time_frame)
     else:
         paper_manager.set_sort_method(method.lower())
@@ -529,6 +594,7 @@ table {
 }
 """
 # --- Initialize Gradio Blocks ---
 demo = gr.Blocks(css=css)
@@ -562,7 +628,7 @@ with demo:
         # Sort Options and Time Frame (conditionally visible)
         with gr.Row():
             sort_radio = gr.Radio(
-                choices=["Hot", "New", "Top"],
                 value="Hot",
                 label="Sort By",
                 interactive=True

 import gradio as gr
 import datasets  # Ensure the datasets library is imported
+import requests   # For making API calls
 from datetime import timezone
 import atexit  # To gracefully shut down the scheduler
 api = HfApi()
+def get_repo_counts(arxiv_id: str) -> dict:
+    """
+    Fetches the number of models, datasets, and Spaces linked to a given arxiv_id using Hugging Face API.
+
+    Args:
+        arxiv_id: Identifier interpolated into the
+            https://huggingface.co/api/arxiv/{arxiv_id}/repos URL.
+
+    Returns:
+        A dict with the keys 'models_count', 'datasets_count' and 'spaces_count'.
+        Each value is the length of the corresponding list in the response, or 0
+        when that key is absent. If the request raises a
+        requests.exceptions.RequestException, the error is logged and all three
+        counts are returned as 0 instead of propagating the exception.
+    """
+    url = f"https://huggingface.co/api/arxiv/{arxiv_id}/repos"
+    try:
+        # Pass an explicit 10-second timeout so the request does not wait without limit.
+        response = requests.get(url, timeout=10)
+        # Convert HTTP error statuses into an exception handled by the except below.
+        response.raise_for_status()
+        # NOTE(review): assumes the body is a JSON object whose 'models', 'datasets'
+        # and 'spaces' values are lists - confirm against the API. Depending on the
+        # installed requests version, a JSON decode failure may not be a
+        # RequestException and would then escape this handler - TODO confirm.
+        data = response.json()
+        # Missing keys fall back to an empty list, i.e. a count of zero.
+        models = data.get('models', [])
+        # Named datasets_list, presumably to avoid shadowing the imported `datasets` module.
+        datasets_list = data.get('datasets', [])
+        spaces = data.get('spaces', [])
+        return {
+            'models_count': len(models),
+            'datasets_count': len(datasets_list),
+            'spaces_count': len(spaces)
+        }
+    except requests.exceptions.RequestException as e:
+        # Best-effort lookup: log the failure and report zero linked repos.
+        logger.error(f"Error fetching repo counts for {arxiv_id}: {e}")
+        return {
+            'models_count': 0,
+            'datasets_count': 0,
+            'spaces_count': 0
+        }
 def get_df() -> pd.DataFrame:
     """
     Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
+    adds a 'paper_page' link for each paper, and fetches counts of models, datasets, and Spaces linked to each paper.
     """
     try:
         # Load datasets
             info = row.copy()
             if "abstract" in info:
                 del info["abstract"]
+            # Fetch repo counts
+            arxiv_id = info.get("arxiv_id", "")
+            if arxiv_id:
+                counts = get_repo_counts(arxiv_id)
+                info.update(counts)
+            else:
+                info.update({
+                    'models_count': 0,
+                    'datasets_count': 0,
+                    'spaces_count': 0
+                })
             paper_info.append(info)
         df_prepared = pd.DataFrame(paper_info)
     """
     Converts raw DataFrame rows into a prettified format suitable for display.
     """
+    REQUIRED_COLUMNS = [
+        "arxiv_id", "date_display", "date", "paper_page",
+        "title", "github", "👍", "💬",
+        "models_count", "datasets_count", "spaces_count"
+    ]
     @staticmethod
     def get_github_link(link: str) -> str:
                 "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row.get("upvotes", 0),
                 "💬": row.get("num_comments", 0),
+                "models_count": row.get("models_count", 0),
+                "datasets_count": row.get("datasets_count", 0),
+                "spaces_count": row.get("spaces_count", 0),
             }
             new_rows.append(new_row)
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
+        ["models_count", "number"],
+        ["datasets_count", "number"],
+        ["spaces_count", "number"],
     ]
     def __init__(self, df: pd.DataFrame):
             df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc, ambiguous='NaT', nonexistent='NaT')
             df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
             df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
+        elif self.sort_method == "most_models":
+            df_sorted = df.sort_values(by='models_count', ascending=False)
+        elif self.sort_method == "most_datasets":
+            df_sorted = df.sort_values(by='datasets_count', ascending=False)
+        elif self.sort_method == "most_spaces":
+            df_sorted = df.sort_values(by='spaces_count', ascending=False)
         else:
             df_sorted = df
         self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[:, self.paper_list.column_names]
         self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
         self.current_page = 1
+        logger.info(f"Papers sorted by {self.sort_method}. Total pages: {self.total_pages}")
     def set_sort_method(self, method, time_frame=None):
         """
+        Sets the sort method ('hot', 'new', 'top', 'most_models', 'most_datasets', 'most_spaces') and re-sorts the papers.
         If 'top' is selected, also sets the time frame.
         """
+        valid_methods = ["hot", "new", "top", "most_models", "most_datasets", "most_spaces"]
+        if method not in valid_methods:
             method = "hot"
         logger.info(f"Setting sort method to: {method}")
         self.sort_method = method
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
+        models = row.get('models_count', 0)
+        datasets_count = row.get('datasets_count', 0)
+        spaces = row.get('spaces_count', 0)
         date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
             published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
             <td colspan="1"></td>
             <td class="subtext">
                 <span class="score">{upvotes} upvotes</span><br>
+                {time_ago} | <a href="#">{comments} comments</a><br>
+                Models: {models} | Datasets: {datasets_count} | Spaces: {spaces}
             </td>
         </tr>
         <tr style="height:5px"></tr>
     Initializes the PaperList and PaperManager with the current DataFrame.
     """
     df = get_df()
+    if df.empty:
+        logger.warning("Initialized with an empty DataFrame.")
     paper_list = PaperList(df)
     manager = PaperManager(paper_list)
+    logger.info("PaperManager initialized.")
     return manager.get_current_page_papers()  # Return HTML string instead of the manager object
 paper_manager = None  # Initialize globally
 def setup_paper_manager():
     """
     Sets up the global PaperManager instance.
     Changes the sort method and, if 'top' is selected, sets the time frame.
     """
     logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
+    if method.lower() in ["most_models", "most_datasets", "most_spaces"]:
+        paper_manager.set_sort_method(method.lower())
+    elif method.lower() == "top":
         paper_manager.set_sort_method(method.lower(), time_frame)
     else:
         paper_manager.set_sort_method(method.lower())
 }
 """
 # --- Initialize Gradio Blocks ---
 demo = gr.Blocks(css=css)
         # Sort Options and Time Frame (conditionally visible)
         with gr.Row():
             sort_radio = gr.Radio(
+                choices=["Hot", "New", "Top", "Most Models", "Most Datasets", "Most Spaces"],
                 value="Hot",
                 label="Sort By",
                 interactive=True