Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -66,6 +66,31 @@ def split_text(text, max_tokens=500):
|
|
| 66 |
|
| 67 |
return chunks
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def remove_punctuation(text):
    """Return *text* with every non-word, non-whitespace character removed.

    Word characters (letters, digits, underscore) and whitespace are kept;
    everything else is deleted.
    """
    non_word_or_space = r'[^\w\s]'
    stripped = re.sub(non_word_or_space, '', text)
    return stripped
|
| 71 |
|
|
@@ -165,6 +190,9 @@ class CombinedProcessor:
|
|
| 165 |
|
| 166 |
bibtex_entry = create_bibtex_entry(bibtex_data)
|
| 167 |
bibtex_entries.append(bibtex_entry)
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
# Join BibTeX entries with HTML formatting
|
| 170 |
formatted_entries = [html.escape(entry) for entry in bibtex_entries]
|
|
|
|
| 66 |
|
| 67 |
return chunks
|
| 68 |
|
| 69 |
+
def _alpha_suffix(index):
    """Return the spreadsheet-style letter suffix for a 1-based index.

    1 -> 'a', 26 -> 'z', 27 -> 'aa', 28 -> 'ab', and so on, so the suffix
    always consists of lowercase letters no matter how many duplicates exist.
    """
    letters = []
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        letters.append(chr(ord('a') + remainder))
    return ''.join(reversed(letters))


def disambiguate_bibtex_ids(bibtex_entries):
    """Make the citation keys of a sequence of BibTeX entries unique.

    The first entry carrying a given key is left untouched. Every later
    entry with the same key gets a letter suffix appended to its key
    ('a', 'b', ..., 'z', 'aa', ...). Suffixes that would reproduce a key
    already present in the input, or one generated earlier, are skipped.

    Args:
        bibtex_entries: Iterable of BibTeX entry strings. An entry whose
            header does not match ``@type{key,`` (with a ``\\w+`` key) is
            passed through unchanged.

    Returns:
        A new list with the entries in their original order, duplicates
        rewritten with disambiguated keys.
    """
    key_pattern = re.compile(r'@\w+{(\w+),')

    # Materialise once: the input is walked twice and may be a generator.
    entries = list(bibtex_entries)
    matches = [key_pattern.search(entry) for entry in entries]

    # Every key that appears in the input is reserved up front so that a
    # generated key can never collide with a key another entry already has.
    reserved_ids = {match.group(1) for match in matches if match}

    id_count = {}
    disambiguated_entries = []

    for entry, match in zip(entries, matches):
        if not match:
            disambiguated_entries.append(entry)
            continue

        original_id = match.group(1)

        # First occurrence keeps its key as-is.
        if original_id not in id_count:
            id_count[original_id] = 0
            disambiguated_entries.append(entry)
            continue

        # Advance the per-key counter until the candidate key is free.
        new_id = original_id
        while new_id in reserved_ids:
            id_count[original_id] += 1
            new_id = f"{original_id}{_alpha_suffix(id_count[original_id])}"
        reserved_ids.add(new_id)

        # Splice by match position instead of a regex replacement template:
        # a template such as '\1' followed by a key starting with a digit is
        # parsed as a different group reference or an octal escape.
        new_entry = entry[:match.start(1)] + new_id + entry[match.end(1):]
        disambiguated_entries.append(new_entry)

    return disambiguated_entries
|
| 93 |
+
|
| 94 |
def remove_punctuation(text):
    """Strip punctuation from *text*.

    Deletes every character that is neither a word character (letter,
    digit, underscore) nor whitespace, and returns the resulting string.
    """
    punctuation_pattern = r'[^\w\s]'
    cleaned_text = re.sub(punctuation_pattern, '', text)
    return cleaned_text
|
| 96 |
|
|
|
|
| 190 |
|
| 191 |
bibtex_entry = create_bibtex_entry(bibtex_data)
|
| 192 |
bibtex_entries.append(bibtex_entry)
|
| 193 |
+
|
| 194 |
+
# Disambiguate citation keys so that no two BibTeX entries share the same ID.
|
| 195 |
+
bibtex_entries = disambiguate_bibtex_ids(bibtex_entries)
|
| 196 |
|
| 197 |
# Join BibTeX entries with HTML formatting
|
| 198 |
formatted_entries = [html.escape(entry) for entry in bibtex_entries]
|