Spaces: Runtime error

Update app.py
Browse files

app.py CHANGED
```diff
@@ -13,7 +13,7 @@ from gradio_calendar import Calendar
 import datasets
 import requests
 
-from datetime import timezone #
+from datetime import timezone # Ensure timezone is imported
 
 # --- Data Loading and Processing ---
 
```
```diff
@@ -27,7 +27,7 @@ api.snapshot_download(
     local_dir=INDEX_DIR_PATH,
 )
 abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
-#
+# Initialize the retriever
 abstract_retriever.search("LLM")
 
 
```
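The dummy `abstract_retriever.search("LLM")` call is a warm-up: it forces the RAGatouille model and index into memory before the first real user query arrives. A minimal sketch of the same pattern (the index path is a placeholder; `from_index` and `search` are RAGatouille's public API):

```python
from ragatouille import RAGPretrainedModel

# Load a prebuilt ColBERT index from disk.
retriever = RAGPretrainedModel.from_index("path/to/index")  # placeholder path

# One throwaway query up front so real traffic doesn't pay the cold-start cost.
_ = retriever.search("LLM", k=1)
```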
```diff
@@ -56,18 +56,24 @@ scheduler_abstract.start()
 
 
 def get_df() -> pd.DataFrame:
-
-
-
-
-
+    # Load and merge datasets
+    df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
+    df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
+    df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
+
+    # Reverse the DataFrame to have the latest papers first
     df = df[::-1].reset_index(drop=True)
-
-
+
+    # Ensure 'date' is in datetime format and handle missing dates
+    df["date"] = pd.to_datetime(df["date"], errors='coerce')
+    df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
+
+    # Prepare the DataFrame by removing 'abstract' and adding 'paper_page'
     paper_info = []
     for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
         info = row.copy()
+        if "abstract" in info:
+            del info["abstract"]
         info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
         paper_info.append(info)
     return pd.DataFrame(paper_info)
```
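The two new date lines do subtle work: `errors='coerce'` turns unparseable values into `NaT`, `dt.strftime` maps `NaT` to `NaN`, and `fillna` then substitutes today's UTC date. A self-contained illustration with toy data (not the real dataset):

```python
import datetime
from datetime import timezone

import pandas as pd

df = pd.DataFrame({"date": ["2024-01-15", "not-a-date", None]})

# Unparseable or missing values become NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# strftime yields NaN for NaT rows, which fillna replaces with today's UTC date.
today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(today)

print(df["date"].tolist())  # ['2024-01-15', '<today>', '<today>']
```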
```diff
@@ -84,22 +90,32 @@ class Prettifier:
     def create_link(text: str, url: str) -> str:
         return f'<a href="{url}" target="_blank">{text}</a>'
 
-    @staticmethod
-    def to_div(text: str | None, category_name: str) -> str:
-        if text is None:
-            text = ""
-        class_name = f"{category_name}-{text.lower()}"
-        return f'<div class="{class_name}">{text}</div>'
-
     def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
         new_rows = []
         for _, row in df.iterrows():
+            # Handle authors: list of dicts or list of strings
+            if "authors" in row and isinstance(row["authors"], list):
+                authors = ', '.join([
+                    author.get('name', '') if isinstance(author, dict) else str(author)
+                    for author in row["authors"]
+                ])
+            else:
+                authors = 'Unknown'
+
+            # Handle published_at: original date
+            published_at = row["date"]  # Already formatted as "%Y-%m-%d"
+
+            # Handle date link
+            date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
+
             new_row = {
                 "arxiv_id": row["arxiv_id"],  # Include arxiv_id
-                "
+                "date_display": date_display,  # For display
+                "published_at": published_at,  # For internal calculations
                 "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                 "title": row["title"],
-                "
+                "authors": authors,  # Include authors
+                "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row["upvotes"],
                 "💬": row["num_comments"],
             }
```
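The `authors` branch has to cope with two shapes the dataset may hand back, a list of `{'name': ...}` dicts or a plain list of strings. A toy check of that branch logic (hypothetical helper and rows, mirroring the patch):

```python
def join_authors(value) -> str:
    # Dicts contribute their 'name'; other items are stringified; non-lists fall back.
    if isinstance(value, list):
        return ', '.join(
            author.get('name', '') if isinstance(author, dict) else str(author)
            for author in value
        )
    return 'Unknown'

print(join_authors([{'name': 'Ada Lovelace'}, {'name': 'Alan Turing'}]))  # Ada Lovelace, Alan Turing
print(join_authors(['Grace Hopper', 'Edsger Dijkstra']))                  # Grace Hopper, Edsger Dijkstra
print(join_authors(None))                                                 # Unknown
```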
```diff
@@ -109,10 +125,12 @@ class PaperList:
 
 class PaperList:
     COLUMN_INFO = [
-        ["arxiv_id", "str"],
-        ["
+        ["arxiv_id", "str"],  # Added arxiv_id
+        ["date_display", "markdown"],  # For display
+        ["published_at", "str"],  # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
+        ["authors", "str"],  # Added authors
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
```
```diff
@@ -140,17 +158,17 @@ class PaperList:
         max_num_to_retrieve: int,
     ) -> pd.DataFrame:
         df = self.df_raw.copy()
-        df["date"] = pd.to_datetime(df["date"])
+        df["date"] = pd.to_datetime(df["date"], errors='coerce')
 
         # Filter by date
         df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
-        df["date"] = df["date"].dt.strftime("%Y-%m-%d")
+        df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
 
         # Filter by title
         if title_search_query:
-            df = df[df["title"].str.contains(title_search_query, case=False)]
+            df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
 
-        # Filter by abstract
+        # Filter by abstract using RAG
         if abstract_search_query:
             results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
             remaining_ids = set(df["arxiv_id"])
```
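The added `na=False` is not cosmetic: for rows with a missing title, `str.contains` returns `NA`, and boolean indexing with an `NA`-bearing mask raises in pandas. A minimal reproduction with made-up titles:

```python
import pandas as pd

df = pd.DataFrame({"title": ["Scaling LLMs", None, "Diffusion Models"]})

# Without na=False the mask would contain <NA> for the missing title and
# df[mask] would raise; with it, rows with missing titles are simply dropped.
mask = df["title"].str.contains("llm", case=False, na=False)
print(df[mask])  # keeps only "Scaling LLMs"
```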
```diff
@@ -166,6 +184,7 @@ class PaperList:
                 found_ids.append(arxiv_id)
             df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
 
+        # Prettify the DataFrame
         df_prettified = self._prettifier(df).loc[:, self.column_names]
         return df_prettified
 
```
```diff
@@ -176,10 +195,9 @@ class PaperManager:
     def __init__(self, paper_list: PaperList, papers_per_page=30):
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
-        self.current_page = 1
         self.sort_method = "hot" # Default sort method
         self.sort_papers()
-
+        # 'current_page' and 'total_pages' are set in 'sort_papers()'
 
     def calculate_score(self, row):
         """
```
```diff
@@ -187,10 +205,9 @@ class PaperManager:
         This mimics the "hotness" algorithm used by platforms like Hacker News.
         """
         upvotes = row.get('👍', 0)
-        published_at_str = row.get('
+        published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
-            published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d")
-            published_time = published_time.replace(tzinfo=timezone.utc)
+            published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
             # If parsing fails, use current time to minimize the impact on sorting
             published_time = datetime.datetime.now(timezone.utc)
```
```diff
@@ -199,7 +216,7 @@ class PaperManager:
         time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
 
         # Avoid division by zero and apply the hotness formula
-        score = upvotes / ((time_diff_hours + 2) ** 1.5)
+        score = upvotes / ((time_diff_hours + 2) ** 1.5) if (time_diff_hours + 2) > 0 else 0
         return score
 
     def sort_papers(self):
```
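For intuition: the score decays polynomially with age, and the `+ 2` offset keeps very fresh papers from dividing by near-zero. A worked example of the formula as patched (standalone function, not the class method):

```python
def hotness(upvotes: float, age_hours: float) -> float:
    # upvotes / (age + 2)^1.5, with the same guard as the patch; note the
    # guard can only trigger for negative ages, since age + 2 > 0 otherwise.
    return upvotes / ((age_hours + 2) ** 1.5) if (age_hours + 2) > 0 else 0

print(round(hotness(10, 6), 2))   # 0.44 -- 10 / 8^1.5
print(round(hotness(10, 46), 2))  # 0.03 -- the same paper 40 hours later
```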
```diff
@@ -209,7 +226,7 @@ class PaperManager:
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
-            df_sorted = df.sort_values(by='
+            df_sorted = df.sort_values(by='published_at', ascending=False)
         else:
             df_sorted = df
 
```
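Sorting by `published_at` works even though the column now holds strings: zero-padded `"%Y-%m-%d"` values order lexicographically exactly as they do chronologically, so no datetime parsing is needed here:

```python
dates = ["2024-03-09", "2024-11-01", "2023-12-31"]
print(sorted(dates, reverse=True))  # ['2024-11-01', '2024-03-09', '2023-12-31']
```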
```diff
@@ -245,10 +262,10 @@ class PaperManager:
         title = row.get('title', 'No title')
         paper_id = row.get('arxiv_id', '')
         url = f"https://huggingface.co/papers/{paper_id}"
-        authors = 'Unknown'
+        authors = row.get('authors', 'Unknown')
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
-        published_time_str = row.get('
+        published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
             published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
```
```diff
@@ -572,6 +589,7 @@ with demo:
     - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
     """)
 
+
 # --- Launch the App ---
 
 if __name__ == "__main__":
```