Spaces:
Runtime error
Runtime error
Commit
·
264c5ec
1
Parent(s):
789383a
fix
Browse files- searcher/sementic_search.py +11 -8
searcher/sementic_search.py
CHANGED
|
@@ -132,6 +132,15 @@ class SementicSearcher:
|
|
| 132 |
return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
|
| 133 |
|
| 134 |
def read_arxiv_from_path(self, pdf_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
try:
|
| 136 |
article_dict = scipdf.parse_pdf_to_dict(pdf_path)
|
| 137 |
except Exception as e:
|
|
@@ -285,10 +294,7 @@ Abstract: {paper['abstract']}
|
|
| 285 |
abstract = result['abstract']
|
| 286 |
citationCount = result['citationCount']
|
| 287 |
year = result['year']
|
| 288 |
-
|
| 289 |
-
article = scipdf.parse_pdf_to_dict(content)
|
| 290 |
-
except Exception as e:
|
| 291 |
-
article = None
|
| 292 |
if not article:
|
| 293 |
continue
|
| 294 |
final_results.append(Result(title,abstract,article,citationCount,year))
|
|
@@ -357,10 +363,7 @@ Abstract: {paper['abstract']}
|
|
| 357 |
url = paper[2]
|
| 358 |
content = await self.download_pdf_async(url)
|
| 359 |
if content:
|
| 360 |
-
|
| 361 |
-
article = scipdf.parse_pdf_to_dict(content)
|
| 362 |
-
except Exception as e:
|
| 363 |
-
article = None
|
| 364 |
if not article:
|
| 365 |
continue
|
| 366 |
result = Result(paper[0],paper[1],article,paper[3],paper[4])
|
|
|
|
| 132 |
return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
|
| 133 |
|
| 134 |
def read_arxiv_from_path(self, pdf_path):
|
| 135 |
+
def is_pdf(binary_data):
|
| 136 |
+
pdf_header = b'%PDF-'
|
| 137 |
+
return binary_data.startswith(pdf_header)
|
| 138 |
+
try:
|
| 139 |
+
flag = is_pdf(pdf_path)
|
| 140 |
+
if not flag:
|
| 141 |
+
return None
|
| 142 |
+
except Exception as e:
|
| 143 |
+
pass
|
| 144 |
try:
|
| 145 |
article_dict = scipdf.parse_pdf_to_dict(pdf_path)
|
| 146 |
except Exception as e:
|
|
|
|
| 294 |
abstract = result['abstract']
|
| 295 |
citationCount = result['citationCount']
|
| 296 |
year = result['year']
|
| 297 |
+
article = self.read_arxiv_from_path(content)
|
|
|
|
|
|
|
|
|
|
| 298 |
if not article:
|
| 299 |
continue
|
| 300 |
final_results.append(Result(title,abstract,article,citationCount,year))
|
|
|
|
| 363 |
url = paper[2]
|
| 364 |
content = await self.download_pdf_async(url)
|
| 365 |
if content:
|
| 366 |
+
article = self.read_arxiv_from_path(content)
|
|
|
|
|
|
|
|
|
|
| 367 |
if not article:
|
| 368 |
continue
|
| 369 |
result = Result(paper[0],paper[1],article,paper[3],paper[4])
|