Spaces:
Running
Running
Enzo Reis de Oliveira
committed on
Commit
·
862c2e6
1
Parent(s):
c9e9b6b
Adding limit and message
Browse files
app.py
CHANGED
|
@@ -36,6 +36,13 @@ def process_inputs(smiles: str, file_obj):
|
|
| 36 |
smiles_col = smiles_cols[0]
|
| 37 |
smiles_list = df_in[smiles_col].astype(str).tolist()
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
out_records = []
|
| 40 |
invalid_smiles = []
|
| 41 |
embed_dim = None
|
|
@@ -44,38 +51,30 @@ def process_inputs(smiles: str, file_obj):
|
|
| 44 |
for sm in smiles_list:
|
| 45 |
try:
|
| 46 |
vec = model.encode(sm, return_torch=True)[0].tolist()
|
| 47 |
-
# guarda dimensão do vetor na primeira vez
|
| 48 |
if embed_dim is None:
|
| 49 |
embed_dim = len(vec)
|
| 50 |
-
# monta registro válido
|
| 51 |
record = {"smiles": sm}
|
| 52 |
record.update({f"dim_{i}": v for i, v in enumerate(vec)})
|
| 53 |
except Exception:
|
| 54 |
-
# marca como inválido
|
| 55 |
invalid_smiles.append(sm)
|
| 56 |
-
# se já souber quantos dims, preenche com None
|
| 57 |
if embed_dim is not None:
|
| 58 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
| 59 |
record.update({f"dim_{i}": None for i in range(embed_dim)})
|
| 60 |
else:
|
| 61 |
-
# ainda não sabemos quantos dims: só guarda smiles
|
| 62 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
| 63 |
out_records.append(record)
|
| 64 |
|
| 65 |
-
# converte para DataFrame (vai unificar todas as colunas)
|
| 66 |
out_df = pd.DataFrame(out_records)
|
| 67 |
out_df.to_csv("embeddings.csv", index=False)
|
| 68 |
|
| 69 |
-
# monta mensagem de saída
|
| 70 |
total = len(smiles_list)
|
| 71 |
valid = total - len(invalid_smiles)
|
|
|
|
| 72 |
if invalid_smiles:
|
| 73 |
-
invalid_count = len(invalid_smiles)
|
| 74 |
msg = (
|
| 75 |
f"{valid} SMILES processed successfully. "
|
| 76 |
-
f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} "
|
| 77 |
-
f"
|
| 78 |
-
+ "\n".join(f"- {sm}" for sm in invalid_smiles)
|
| 79 |
)
|
| 80 |
else:
|
| 81 |
msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
|
|
@@ -104,9 +103,15 @@ with gr.Blocks() as demo:
|
|
| 104 |
gr.Markdown(
|
| 105 |
"""
|
| 106 |
# SMI-TED-Embeddings-Extraction
|
|
|
|
| 107 |
**Single mode:** paste a SMILES string in the left box.
|
| 108 |
**Batch mode:** upload a CSV file where each row has a SMILES in the first column.
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
"""
|
| 111 |
)
|
| 112 |
|
|
|
|
| 36 |
smiles_col = smiles_cols[0]
|
| 37 |
smiles_list = df_in[smiles_col].astype(str).tolist()
|
| 38 |
|
| 39 |
+
# **new**: limit of 1000 SMILES
|
| 40 |
+
if len(smiles_list) > 1000:
|
| 41 |
+
return (
|
| 42 |
+
f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
|
| 43 |
+
gr.update(visible=False),
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
out_records = []
|
| 47 |
invalid_smiles = []
|
| 48 |
embed_dim = None
|
|
|
|
| 51 |
for sm in smiles_list:
|
| 52 |
try:
|
| 53 |
vec = model.encode(sm, return_torch=True)[0].tolist()
|
|
|
|
| 54 |
if embed_dim is None:
|
| 55 |
embed_dim = len(vec)
|
|
|
|
| 56 |
record = {"smiles": sm}
|
| 57 |
record.update({f"dim_{i}": v for i, v in enumerate(vec)})
|
| 58 |
except Exception:
|
|
|
|
| 59 |
invalid_smiles.append(sm)
|
|
|
|
| 60 |
if embed_dim is not None:
|
| 61 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
| 62 |
record.update({f"dim_{i}": None for i in range(embed_dim)})
|
| 63 |
else:
|
|
|
|
| 64 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
| 65 |
out_records.append(record)
|
| 66 |
|
|
|
|
| 67 |
out_df = pd.DataFrame(out_records)
|
| 68 |
out_df.to_csv("embeddings.csv", index=False)
|
| 69 |
|
|
|
|
| 70 |
total = len(smiles_list)
|
| 71 |
valid = total - len(invalid_smiles)
|
| 72 |
+
invalid_count = len(invalid_smiles)
|
| 73 |
if invalid_smiles:
|
|
|
|
| 74 |
msg = (
|
| 75 |
f"{valid} SMILES processed successfully. "
|
| 76 |
+
f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} could not be parsed by RDKit:\n"
|
| 77 |
+
+ "\n".join(f"- {s}" for s in invalid_smiles)
|
|
|
|
| 78 |
)
|
| 79 |
else:
|
| 80 |
msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
|
|
|
|
| 103 |
gr.Markdown(
|
| 104 |
"""
|
| 105 |
# SMI-TED-Embeddings-Extraction
|
| 106 |
+
|
| 107 |
**Single mode:** paste a SMILES string in the left box.
|
| 108 |
**Batch mode:** upload a CSV file where each row has a SMILES in the first column.
|
| 109 |
+
- **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.
|
| 110 |
+
_This is just a demo environment; for heavy-duty usage, please visit:_
|
| 111 |
+
https://github.com/IBM/materials/tree/main/models/smi_ted
|
| 112 |
+
to download the model and run your own experiments.
|
| 113 |
+
|
| 114 |
+
- In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
|
| 115 |
"""
|
| 116 |
)
|
| 117 |
|