Spaces:
Sleeping
Sleeping
Commit
·
05f3fd9
1
Parent(s):
e547dc0
deepnote update
Browse files
util.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
from langchain.docstore.document import Document
|
|
|
|
| 3 |
|
| 4 |
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
|
| 5 |
SHEET_URL_Y = "/edit#gid="
|
|
@@ -80,13 +81,15 @@ def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[l
|
|
| 80 |
new_rows = []
|
| 81 |
for index, row in df.iterrows():
|
| 82 |
new_rows.append(row)
|
|
|
|
| 83 |
for synonym_list in synonyms:
|
| 84 |
-
for
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
| 88 |
new_row = row.copy()
|
| 89 |
-
new_row[column] =
|
| 90 |
new_rows.append(new_row)
|
| 91 |
new_df = pd.DataFrame(new_rows, columns=df.columns)
|
| 92 |
new_df = new_df.reset_index(drop=True)
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
from langchain.docstore.document import Document
|
| 3 |
+
import re
|
| 4 |
|
| 5 |
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
|
| 6 |
SHEET_URL_Y = "/edit#gid="
|
|
|
|
| 81 |
new_rows = []
|
| 82 |
for index, row in df.iterrows():
|
| 83 |
new_rows.append(row)
|
| 84 |
+
text = row[column]
|
| 85 |
for synonym_list in synonyms:
|
| 86 |
+
for synonym in synonym_list:
|
| 87 |
+
pattern = r'\b(?:{}|{}(?:s|es|ed|ing)?)\b'.format(synonym, synonym)
|
| 88 |
+
if re.search(pattern, text):
|
| 89 |
+
for replacement in synonym_list:
|
| 90 |
+
if replacement != synonym:
|
| 91 |
new_row = row.copy()
|
| 92 |
+
new_row[column] = re.sub(pattern, replacement, text)
|
| 93 |
new_rows.append(new_row)
|
| 94 |
new_df = pd.DataFrame(new_rows, columns=df.columns)
|
| 95 |
new_df = new_df.reset_index(drop=True)
|