import logging
import os

import arxiv
import pandas as pd
import requests
from pinecone import Pinecone, ServerlessSpec

# Run relative file I/O (e.g. arxiv-scrape.csv) from the script's own directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)
def get_zotero_ids(api_key, library_id, tag):
    """Fetch items tagged `tag` from a Zotero user library and return their bare arXiv IDs."""
    base_url = 'https://api.zotero.org'
    suffix = f'/users/{library_id}/items?tag={tag}'
    header = {'Authorization': f'Bearer {api_key}'}
    response = requests.get(base_url + suffix, headers=header)
    response.raise_for_status()
    # Each item is expected to carry an `archiveID` field like 'arXiv:2101.00001'.
    return [item['data']['archiveID'].replace('arXiv:', '') for item in response.json()]
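# Usage sketch (hypothetical values; the key, library ID, and tag below are
# placeholders, not real credentials):
#   ids = get_zotero_ids(api_key='ZOTERO_API_KEY', library_id='1234567', tag='arxiv')
#   -> ['2101.00001', ...]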
def get_arxiv_papers(ids=None, category=None, comment=None):
    """Fetch paper metadata from arXiv, by explicit ID list or by category query."""
    if ids is None and category is None:
        raise ValueError('Either `ids` or `category` must be provided.')
    logging.getLogger('arxiv').setLevel(logging.WARNING)
    client = arxiv.Client()
    if category is None:
        search = arxiv.Search(
            id_list=ids,
            max_results=len(ids),
        )
    else:
        if comment is None:
            custom_query = f'cat:{category}'
        else:
            custom_query = f'cat:{category} AND co:{comment}'
        search = arxiv.Search(
            query=custom_query,
            max_results=15,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
    # Materialize the results once; iterating client.results(search) separately
    # for each column would re-run the API query four times.
    results = list(client.results(search))
    df = pd.DataFrame({
        'Title': [result.title for result in results],
        'Abstract': [result.summary.replace('\n', ' ') for result in results],
        'Date': [result.published.date().strftime('%Y-%m-%d') for result in results],
        'id': [result.entry_id for result in results],
    })
    if ids:
        # An explicit ID list seeds the local cache of already-seen papers.
        df.to_csv('arxiv-scrape.csv', index=False)
    return df
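# Usage sketch: with explicit IDs the scrape also seeds arxiv-scrape.csv; with a
# category (and optional comment filter) it pulls the 15 most recent submissions.
#   library_df = get_arxiv_papers(ids=['2101.00001'])   # hypothetical ID
#   latest_df = get_arxiv_papers(category='cs.LG')      # hypothetical category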
def get_hf_embeddings(api_key, df):
    """Embed title+abstract pairs with the SciNCL model via the HF Inference API."""
    title_abs = [title + '[SEP]' + abstract
                 for title, abstract in zip(df['Title'], df['Abstract'])]
    API_URL = "https://api-inference.huggingface.co/models/malteos/scincl"
    headers = {"Authorization": f"Bearer {api_key}"}
    response = requests.post(API_URL, headers=headers,
                             json={"inputs": title_abs,
                                   "options": {"wait_for_model": False}})
    if response.status_code == 503:
        # Model is cold: retry and block until it has loaded.
        response = requests.post(API_URL, headers=headers,
                                 json={"inputs": title_abs,
                                       "options": {"wait_for_model": True}})
    # A non-200 response body is an error dict, which previously surfaced as
    # `KeyError: 0` when indexed like a list of vectors; fail loudly instead.
    response.raise_for_status()
    embeddings = response.json()
    return embeddings, len(embeddings[0])
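# Note: SciNCL was trained on 'title[SEP]abstract' inputs, which is why the two
# fields are joined with '[SEP]' above. The returned dimension feeds the Pinecone
# index creation below; this assumes the endpoint returns one pooled vector per
# input (768 values for a BERT-base model such as SciNCL).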
def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
    """Create a serverless Pinecone index (if needed) and upsert one vector per paper."""
    # `vectors` avoids shadowing the built-in `input`; IDs are the arXiv entry URLs.
    vectors = [{'id': df['id'][i], 'values': embeddings[i]} for i in range(len(embeddings))]
    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        logging.warning(f'Index name: {index} already exists.')
        return f'Index name: {index} already exists'
    pc.create_index(
        name=index,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    index = pc.Index(index)
    return index.upsert(vectors=vectors, namespace=namespace)
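# Usage sketch (hypothetical index and namespace names):
#   upload_to_pinecone(api_key='PINECONE_API_KEY', index='arxiv-papers',
#                      namespace='prod', embeddings=embeddings, dim=dim, df=df)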
def get_new_papers(df):
    """Anti-join the fresh scrape against the local cache, returning only unseen papers."""
    df_main = pd.read_csv('arxiv-scrape.csv')
    df = df.reset_index(drop=True)
    # Left merge with indicator: rows marked 'left_only' exist only in the new scrape.
    union_df = df.merge(df_main, how='left', indicator=True)
    df = union_df[union_df['_merge'] == 'left_only'].drop(columns=['_merge'])
    if df.empty:
        return 'No New Papers Found'
    else:
        df_main = pd.concat([df_main, df], ignore_index=True)
        df_main.drop_duplicates(inplace=True)
        df_main.to_csv('arxiv-scrape.csv', index=False)
        # Reset the index so positional lookups (df['id'][i]) line up with the
        # embedding order downstream.
        return df.reset_index(drop=True)
def recommend_papers(api_key, index, namespace, embeddings, df, threshold):
    """Score each new paper against the Pinecone index and return Markdown links for matches."""
    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        index = pc.Index(index)
    else:
        raise ValueError(f"{index} doesn't exist. Project isn't initialized properly.")
    results = []
    score_threshold = threshold
    for i, embedding in enumerate(embeddings):
        query = embedding
        result = index.query(namespace=namespace, vector=query, top_k=3, include_values=False)
        # Sum of the top-3 cosine similarities against the library's papers.
        sum_score = sum(match['score'] for match in result['matches'])
        if sum_score > score_threshold:
            results.append(f"Paper-URL : [{df['id'][i]}]({df['id'][i]}) with score: {sum_score / 3} <br />")
    if results:
        return '\n'.join(results)
    else:
        return 'No Interesting Paper'
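
# End-to-end wiring sketch. This block is an assumption about how the functions
# above chain together, not part of the original script: the environment-variable
# names, index name, namespace, tag, category, and threshold are all placeholders.
if __name__ == '__main__':
    zotero_ids = get_zotero_ids(os.environ['ZOTERO_API_KEY'],
                                os.environ['ZOTERO_LIBRARY_ID'], tag='arxiv')
    seed_df = get_arxiv_papers(ids=zotero_ids)  # also writes arxiv-scrape.csv
    seed_embeddings, dim = get_hf_embeddings(os.environ['HF_API_KEY'], seed_df)
    upload_to_pinecone(os.environ['PINECONE_API_KEY'], 'arxiv-papers', 'prod',
                       seed_embeddings, dim, seed_df)

    candidates = get_arxiv_papers(category='cs.LG')
    new_df = get_new_papers(candidates)
    if isinstance(new_df, pd.DataFrame):
        new_embeddings, _ = get_hf_embeddings(os.environ['HF_API_KEY'], new_df)
        print(recommend_papers(os.environ['PINECONE_API_KEY'], 'arxiv-papers',
                               'prod', new_embeddings, new_df, threshold=2.0))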