Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,6 @@ import pandas as pd
|
|
| 6 |
import tqdm.auto
|
| 7 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 8 |
from huggingface_hub import HfApi
|
| 9 |
-
from ragatouille import RAGPretrainedModel
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
from gradio_calendar import Calendar
|
|
@@ -21,39 +20,30 @@ api = HfApi()
|
|
| 21 |
|
| 22 |
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
|
| 23 |
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
| 24 |
-
api.snapshot_download(
|
| 25 |
-
repo_id=INDEX_REPO_ID,
|
| 26 |
-
repo_type="dataset",
|
| 27 |
-
local_dir=INDEX_DIR_PATH,
|
| 28 |
-
)
|
| 29 |
-
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
|
| 30 |
-
# Initialize the retriever
|
| 31 |
-
abstract_retriever.search("LLM")
|
| 32 |
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
global abstract_retriever
|
| 36 |
-
|
| 37 |
-
api.snapshot_download(
|
| 38 |
-
repo_id=INDEX_REPO_ID,
|
| 39 |
-
repo_type="dataset",
|
| 40 |
-
local_dir=INDEX_DIR_PATH,
|
| 41 |
-
)
|
| 42 |
-
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
|
| 43 |
-
abstract_retriever.search("LLM")
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Scheduler for updating abstract index every hour
|
| 47 |
-
scheduler_abstract
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
|
|
|
| 57 |
|
| 58 |
def get_df() -> pd.DataFrame:
|
| 59 |
# Load and merge datasets
|
|
@@ -154,7 +144,6 @@ class PaperList:
|
|
| 154 |
start_date: datetime.datetime,
|
| 155 |
end_date: datetime.datetime,
|
| 156 |
title_search_query: str,
|
| 157 |
-
abstract_search_query: str,
|
| 158 |
max_num_to_retrieve: int,
|
| 159 |
) -> pd.DataFrame:
|
| 160 |
df = self.df_raw.copy()
|
|
@@ -168,21 +157,7 @@ class PaperList:
|
|
| 168 |
if title_search_query:
|
| 169 |
df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
|
| 170 |
|
| 171 |
-
#
|
| 172 |
-
if abstract_search_query:
|
| 173 |
-
results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
|
| 174 |
-
remaining_ids = set(df["arxiv_id"])
|
| 175 |
-
found_id_set = set()
|
| 176 |
-
found_ids = []
|
| 177 |
-
for x in results:
|
| 178 |
-
arxiv_id = x["document_id"]
|
| 179 |
-
if arxiv_id not in remaining_ids:
|
| 180 |
-
continue
|
| 181 |
-
if arxiv_id in found_id_set:
|
| 182 |
-
continue
|
| 183 |
-
found_id_set.add(arxiv_id)
|
| 184 |
-
found_ids.append(arxiv_id)
|
| 185 |
-
df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
|
| 186 |
|
| 187 |
# Prettify the DataFrame
|
| 188 |
df_prettified = self._prettifier(df).loc[:, self.column_names]
|
|
@@ -205,7 +180,7 @@ class PaperManager:
|
|
| 205 |
This mimics the "hotness" algorithm used by platforms like Hacker News.
|
| 206 |
"""
|
| 207 |
upvotes = row.get('👍', 0)
|
| 208 |
-
published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
| 209 |
try:
|
| 210 |
published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
| 211 |
except ValueError:
|
|
@@ -226,7 +201,7 @@ class PaperManager:
|
|
| 226 |
df['score'] = df.apply(self.calculate_score, axis=1)
|
| 227 |
df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
|
| 228 |
elif self.sort_method == "new":
|
| 229 |
-
df_sorted = df.sort_values(by='published_at', ascending=False)
|
| 230 |
else:
|
| 231 |
df_sorted = df
|
| 232 |
|
|
|
|
| 6 |
import tqdm.auto
|
| 7 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 8 |
from huggingface_hub import HfApi
|
|
|
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
from gradio_calendar import Calendar
|
|
|
|
| 20 |
|
| 21 |
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
|
| 22 |
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
# Removed ragatouille and abstract_retriever initialization
|
| 25 |
+
# If INDEX_REPO_ID is not used elsewhere, consider removing related lines
|
| 26 |
|
| 27 |
+
# Removed abstract_retriever initialization and search
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
def update_abstract_index() -> None:
|
| 30 |
+
"""
|
| 31 |
+
Removed abstract_retriever update functionality since ragatouille is no longer used.
|
| 32 |
+
"""
|
| 33 |
+
pass # No operation needed
|
| 34 |
|
| 35 |
# Scheduler for updating abstract index every hour
|
| 36 |
+
# Removed scheduler_abstract as it's no longer necessary
|
| 37 |
+
# If INDEX_REPO_ID is not used elsewhere, consider removing the download
|
| 38 |
+
|
| 39 |
+
# Optionally, remove the snapshot_download if the index is not needed
|
| 40 |
+
# api.snapshot_download(
|
| 41 |
+
# repo_id=INDEX_REPO_ID,
|
| 42 |
+
# repo_type="dataset",
|
| 43 |
+
# local_dir=INDEX_DIR_PATH,
|
| 44 |
+
# )
|
| 45 |
|
| 46 |
+
# --- DataFrame Preparation ---
|
| 47 |
|
| 48 |
def get_df() -> pd.DataFrame:
|
| 49 |
# Load and merge datasets
|
|
|
|
| 144 |
start_date: datetime.datetime,
|
| 145 |
end_date: datetime.datetime,
|
| 146 |
title_search_query: str,
|
|
|
|
| 147 |
max_num_to_retrieve: int,
|
| 148 |
) -> pd.DataFrame:
|
| 149 |
df = self.df_raw.copy()
|
|
|
|
| 157 |
if title_search_query:
|
| 158 |
df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
|
| 159 |
|
| 160 |
+
# Removed abstract_search_query filtering since ragatouille is no longer used
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
# Prettify the DataFrame
|
| 163 |
df_prettified = self._prettifier(df).loc[:, self.column_names]
|
|
|
|
| 180 |
This mimics the "hotness" algorithm used by platforms like Hacker News.
|
| 181 |
"""
|
| 182 |
upvotes = row.get('👍', 0)
|
| 183 |
+
published_at_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")) # **FIX** Changed from 'published_at' to 'date'
|
| 184 |
try:
|
| 185 |
published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
| 186 |
except ValueError:
|
|
|
|
| 201 |
df['score'] = df.apply(self.calculate_score, axis=1)
|
| 202 |
df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
|
| 203 |
elif self.sort_method == "new":
|
| 204 |
+
df_sorted = df.sort_values(by='date', ascending=False) # **FIX** Changed from 'published_at' to 'date'
|
| 205 |
else:
|
| 206 |
df_sorted = df
|
| 207 |
|