validate antibody names
Browse files
- constants.py +2 -2
- data/antibody_names.csv +138 -0
- test/conftest.py +2 -6
- test/test_validation.py +20 -17
- validation.py +16 -6
constants.py
CHANGED
|
@@ -4,7 +4,7 @@ Constants for the Antibody Developability Benchmark
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
from huggingface_hub import HfApi
|
| 7 |
-
|
| 8 |
|
| 9 |
ASSAY_LIST = ["AC-SINS_pH7.4", "PR_CHO", "HIC", "Tm2", "Titer"]
|
| 10 |
ASSAY_RENAME = {
|
|
@@ -32,11 +32,11 @@ ASSAY_EMOJIS = {
|
|
| 32 |
# Input CSV file requirements
|
| 33 |
MINIMAL_NUMBER_OF_ROWS: int = 50
|
| 34 |
REQUIRED_COLUMNS: list[str] = [
|
| 35 |
-
"antibody_id",
|
| 36 |
"antibody_name",
|
| 37 |
"vh_protein_sequence",
|
| 38 |
"vl_protein_sequence",
|
| 39 |
] + ASSAY_LIST
|
|
|
|
| 40 |
|
| 41 |
# Huggingface API
|
| 42 |
TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
from huggingface_hub import HfApi
|
| 7 |
+
import pandas as pd
|
| 8 |
|
| 9 |
ASSAY_LIST = ["AC-SINS_pH7.4", "PR_CHO", "HIC", "Tm2", "Titer"]
|
| 10 |
ASSAY_RENAME = {
|
|
|
|
| 32 |
# Input CSV file requirements
|
| 33 |
MINIMAL_NUMBER_OF_ROWS: int = 50
|
| 34 |
REQUIRED_COLUMNS: list[str] = [
|
|
|
|
| 35 |
"antibody_name",
|
| 36 |
"vh_protein_sequence",
|
| 37 |
"vl_protein_sequence",
|
| 38 |
] + ASSAY_LIST
|
| 39 |
+
ANTIBODY_NAMES = pd.read_csv("data/antibody_names.csv")["antibody_name"].tolist()
|
| 40 |
|
| 41 |
# Huggingface API
|
| 42 |
TOKEN = os.environ.get("HF_TOKEN")
|
data/antibody_names.csv
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
antibody_name
|
| 2 |
+
abituzumab
|
| 3 |
+
abrilumab
|
| 4 |
+
adalimumab
|
| 5 |
+
alemtuzumab
|
| 6 |
+
alirocumab
|
| 7 |
+
anifrolumab
|
| 8 |
+
atezolizumab
|
| 9 |
+
bapineuzumab
|
| 10 |
+
basiliximab
|
| 11 |
+
bavituximab
|
| 12 |
+
belimumab
|
| 13 |
+
benralizumab
|
| 14 |
+
bevacizumab
|
| 15 |
+
bimagrumab
|
| 16 |
+
blosozumab
|
| 17 |
+
bococizumab
|
| 18 |
+
brentuximab
|
| 19 |
+
briakinumab
|
| 20 |
+
brodalumab
|
| 21 |
+
canakinumab
|
| 22 |
+
carlumab
|
| 23 |
+
certolizumab
|
| 24 |
+
cetuximab
|
| 25 |
+
cixutumumab
|
| 26 |
+
clazakizumab
|
| 27 |
+
codrituzumab
|
| 28 |
+
crenezumab
|
| 29 |
+
dacetuzumab
|
| 30 |
+
daclizumab
|
| 31 |
+
dalotuzumab
|
| 32 |
+
daratumumab
|
| 33 |
+
denosumab
|
| 34 |
+
dinutuximab
|
| 35 |
+
drozitumab
|
| 36 |
+
duligotuzumab
|
| 37 |
+
dupilumab
|
| 38 |
+
eculizumab
|
| 39 |
+
efalizumab
|
| 40 |
+
eldelumab
|
| 41 |
+
elotuzumab
|
| 42 |
+
emibetuzumab
|
| 43 |
+
enokizumab
|
| 44 |
+
epratuzumab
|
| 45 |
+
etrolizumab
|
| 46 |
+
evolocumab
|
| 47 |
+
farletuzumab
|
| 48 |
+
fasinumab
|
| 49 |
+
fezakinumab
|
| 50 |
+
ficlatuzumab
|
| 51 |
+
figitumumab
|
| 52 |
+
fletikumab
|
| 53 |
+
foralumab
|
| 54 |
+
fresolimumab
|
| 55 |
+
fulranumab
|
| 56 |
+
galiximab
|
| 57 |
+
ganitumab
|
| 58 |
+
gantenerumab
|
| 59 |
+
gemtuzumab
|
| 60 |
+
gevokizumab
|
| 61 |
+
girentuximab
|
| 62 |
+
glembatumumab
|
| 63 |
+
golimumab
|
| 64 |
+
guselkumab
|
| 65 |
+
ibalizumab
|
| 66 |
+
imgatuzumab
|
| 67 |
+
infliximab
|
| 68 |
+
inotuzumab
|
| 69 |
+
ipilimumab
|
| 70 |
+
ixekizumab
|
| 71 |
+
lampalizumab
|
| 72 |
+
lebrikizumab
|
| 73 |
+
lenzilumab
|
| 74 |
+
lintuzumab
|
| 75 |
+
lirilumab
|
| 76 |
+
lumiliximab
|
| 77 |
+
matuzumab
|
| 78 |
+
mavrilimumab
|
| 79 |
+
mepolizumab
|
| 80 |
+
mogamulizumab
|
| 81 |
+
motavizumab
|
| 82 |
+
muromonab
|
| 83 |
+
natalizumab
|
| 84 |
+
necitumumab
|
| 85 |
+
nimotuzumab
|
| 86 |
+
nivolumab
|
| 87 |
+
obinutuzumab
|
| 88 |
+
ocrelizumab
|
| 89 |
+
ofatumumab
|
| 90 |
+
olaratumab
|
| 91 |
+
olokizumab
|
| 92 |
+
omalizumab
|
| 93 |
+
onartuzumab
|
| 94 |
+
otelixizumab
|
| 95 |
+
otlertuzumab
|
| 96 |
+
ozanezumab
|
| 97 |
+
palivizumab
|
| 98 |
+
panitumumab
|
| 99 |
+
panobacumab
|
| 100 |
+
parsatuzumab
|
| 101 |
+
patritumab
|
| 102 |
+
pembrolizumab
|
| 103 |
+
pertuzumab
|
| 104 |
+
pinatuzumab
|
| 105 |
+
polatuzumab
|
| 106 |
+
ponezumab
|
| 107 |
+
radretumab
|
| 108 |
+
ramucirumab
|
| 109 |
+
ranibizumab
|
| 110 |
+
reslizumab
|
| 111 |
+
rilotumumab
|
| 112 |
+
rituximab
|
| 113 |
+
robatumumab
|
| 114 |
+
romosozumab
|
| 115 |
+
sarilumab
|
| 116 |
+
secukinumab
|
| 117 |
+
seribantumab
|
| 118 |
+
sifalimumab
|
| 119 |
+
siltuximab
|
| 120 |
+
simtuzumab
|
| 121 |
+
sirukumab
|
| 122 |
+
tabalumab
|
| 123 |
+
tanezumab
|
| 124 |
+
teplizumab
|
| 125 |
+
tigatuzumab
|
| 126 |
+
tildrakizumab
|
| 127 |
+
tocilizumab
|
| 128 |
+
tovetumab
|
| 129 |
+
tralokinumab
|
| 130 |
+
trastuzumab
|
| 131 |
+
tremelimumab
|
| 132 |
+
urelumab
|
| 133 |
+
ustekinumab
|
| 134 |
+
vedolizumab
|
| 135 |
+
veltuzumab
|
| 136 |
+
visilizumab
|
| 137 |
+
zalutumumab
|
| 138 |
+
zanolimumab
|
test/conftest.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
| 1 |
import pytest
|
| 2 |
import pandas as pd
|
| 3 |
-
from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST
|
| 4 |
|
| 5 |
|
| 6 |
@pytest.fixture
|
| 7 |
def valid_csv_data():
|
| 8 |
-
"""Fixture providing valid CSV data with all required columns"""
|
| 9 |
return {
|
| 10 |
-
"
|
| 11 |
-
"antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
|
| 12 |
"vh_protein_sequence": [
|
| 13 |
"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
|
| 14 |
]
|
|
@@ -23,11 +21,9 @@ def valid_csv_data():
|
|
| 23 |
|
| 24 |
@pytest.fixture
|
| 25 |
def valid_input_dataframe(valid_csv_data):
|
| 26 |
-
"""Fixture providing a valid input dataframe"""
|
| 27 |
return pd.DataFrame(valid_csv_data)
|
| 28 |
|
| 29 |
|
| 30 |
@pytest.fixture
|
| 31 |
def valid_csv_content(valid_input_dataframe):
|
| 32 |
-
"""Fixture providing valid CSV content as string"""
|
| 33 |
return valid_input_dataframe.to_csv(index=False)
|
|
|
|
| 1 |
import pytest
|
| 2 |
import pandas as pd
|
| 3 |
+
from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST, ANTIBODY_NAMES
|
| 4 |
|
| 5 |
|
| 6 |
@pytest.fixture
|
| 7 |
def valid_csv_data():
|
|
|
|
| 8 |
return {
|
| 9 |
+
"antibody_name": ANTIBODY_NAMES[:MINIMAL_NUMBER_OF_ROWS],
|
|
|
|
| 10 |
"vh_protein_sequence": [
|
| 11 |
"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
|
| 12 |
]
|
|
|
|
| 21 |
|
| 22 |
@pytest.fixture
|
| 23 |
def valid_input_dataframe(valid_csv_data):
|
|
|
|
| 24 |
return pd.DataFrame(valid_csv_data)
|
| 25 |
|
| 26 |
|
| 27 |
@pytest.fixture
|
| 28 |
def valid_csv_content(valid_input_dataframe):
|
|
|
|
| 29 |
return valid_input_dataframe.to_csv(index=False)
|
test/test_validation.py
CHANGED
|
@@ -9,14 +9,12 @@ class TestValidateCsvCanBeRead:
|
|
| 9 |
"""Test cases for validate_csv_can_be_read function"""
|
| 10 |
|
| 11 |
def test_valid_csv_can_be_read(self, valid_csv_content):
|
| 12 |
-
"""Test that valid CSV content can be read"""
|
| 13 |
df = validate_csv_can_be_read(valid_csv_content)
|
| 14 |
assert isinstance(df, pd.DataFrame)
|
| 15 |
assert len(df) == MINIMAL_NUMBER_OF_ROWS
|
| 16 |
assert list(df.columns) == list(REQUIRED_COLUMNS)
|
| 17 |
|
| 18 |
def test_empty_csv_raises_error(self):
|
| 19 |
-
"""Test that empty CSV raises an error"""
|
| 20 |
empty_csv = ""
|
| 21 |
|
| 22 |
with pytest.raises(gr.Error) as exc_info:
|
|
@@ -25,7 +23,6 @@ class TestValidateCsvCanBeRead:
|
|
| 25 |
assert "empty or contains no valid data" in str(exc_info.value)
|
| 26 |
|
| 27 |
def test_invalid_csv_format_raises_error(self):
|
| 28 |
-
"""Test that invalid CSV format raises an error"""
|
| 29 |
# Create a CSV with malformed structure that pandas cannot parse
|
| 30 |
malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
|
| 31 |
|
|
@@ -35,10 +32,9 @@ class TestValidateCsvCanBeRead:
|
|
| 35 |
assert "Invalid CSV format" in str(exc_info.value)
|
| 36 |
|
| 37 |
def test_csv_with_quoted_fields_can_be_read(self):
|
| 38 |
-
"""Test that CSV with quoted fields can be read"""
|
| 39 |
# Create CSV with quoted fields and enough rows
|
| 40 |
-
base_row = '
|
| 41 |
-
csv_content = "
|
| 42 |
csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
|
| 43 |
|
| 44 |
df = validate_csv_can_be_read(csv_content)
|
|
@@ -47,14 +43,10 @@ class TestValidateCsvCanBeRead:
|
|
| 47 |
|
| 48 |
|
| 49 |
class TestValidateDataframe:
|
| 50 |
-
"""Test cases for validate_dataframe function"""
|
| 51 |
-
|
| 52 |
def test_valid_dataframe_passes(self, valid_input_dataframe):
|
| 53 |
-
"""Test that valid DataFrame passes validation"""
|
| 54 |
validate_dataframe(valid_input_dataframe)
|
| 55 |
|
| 56 |
def test_missing_columns_raises_error(self, valid_input_dataframe):
|
| 57 |
-
"""Test that DataFrame with missing columns raises an error"""
|
| 58 |
missing_column = REQUIRED_COLUMNS[0]
|
| 59 |
df = valid_input_dataframe.copy()
|
| 60 |
df.drop(columns=[missing_column], inplace=True)
|
|
@@ -65,7 +57,6 @@ class TestValidateDataframe:
|
|
| 65 |
assert f"Missing required columns: {missing_column}" in str(exc_info.value)
|
| 66 |
|
| 67 |
def test_empty_dataframe_raises_error(self, valid_input_dataframe):
|
| 68 |
-
"""Test that empty DataFrame raises an error"""
|
| 69 |
empty_df = valid_input_dataframe.head(0)
|
| 70 |
|
| 71 |
with pytest.raises(gr.Error) as exc_info:
|
|
@@ -74,7 +65,6 @@ class TestValidateDataframe:
|
|
| 74 |
assert "CSV file is empty" in str(exc_info.value)
|
| 75 |
|
| 76 |
def test_insufficient_rows_raises_error(self, valid_input_dataframe):
|
| 77 |
-
"""Test that DataFrame with insufficient rows raises an error"""
|
| 78 |
df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
|
| 79 |
with pytest.raises(gr.Error) as exc_info:
|
| 80 |
validate_dataframe(df)
|
|
@@ -84,7 +74,6 @@ class TestValidateDataframe:
|
|
| 84 |
)
|
| 85 |
|
| 86 |
def test_missing_values_raises_error(self, valid_input_dataframe):
|
| 87 |
-
"""Test that DataFrame with missing values raises an error"""
|
| 88 |
bad_column = REQUIRED_COLUMNS[0]
|
| 89 |
df = valid_input_dataframe.copy()
|
| 90 |
df[bad_column] = [None] * len(df)
|
|
@@ -94,17 +83,31 @@ class TestValidateDataframe:
|
|
| 94 |
assert f"contains {len(df)} missing values" in str(exc_info.value)
|
| 95 |
|
| 96 |
def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
|
| 97 |
-
"""Test that DataFrame with extra columns passes validation"""
|
| 98 |
extra_column = "extra_column_1"
|
| 99 |
df = valid_input_dataframe.copy()
|
| 100 |
df[extra_column] = ["extra1"] * len(df)
|
| 101 |
df[extra_column] = ["extra2"] * len(df)
|
| 102 |
validate_dataframe(df)
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
class TestValidateCsvFile:
|
| 106 |
-
"""Test cases for the combined validate_csv_file function"""
|
| 107 |
|
|
|
|
| 108 |
def test_valid_csv_passes(self, valid_csv_content):
|
| 109 |
-
"""Test that a valid CSV with all required columns passes validation"""
|
| 110 |
validate_csv_file(valid_csv_content)
|
|
|
|
| 9 |
"""Test cases for validate_csv_can_be_read function"""
|
| 10 |
|
| 11 |
def test_valid_csv_can_be_read(self, valid_csv_content):
|
|
|
|
| 12 |
df = validate_csv_can_be_read(valid_csv_content)
|
| 13 |
assert isinstance(df, pd.DataFrame)
|
| 14 |
assert len(df) == MINIMAL_NUMBER_OF_ROWS
|
| 15 |
assert list(df.columns) == list(REQUIRED_COLUMNS)
|
| 16 |
|
| 17 |
def test_empty_csv_raises_error(self):
|
|
|
|
| 18 |
empty_csv = ""
|
| 19 |
|
| 20 |
with pytest.raises(gr.Error) as exc_info:
|
|
|
|
| 23 |
assert "empty or contains no valid data" in str(exc_info.value)
|
| 24 |
|
| 25 |
def test_invalid_csv_format_raises_error(self):
|
|
|
|
| 26 |
# Create a CSV with malformed structure that pandas cannot parse
|
| 27 |
malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
|
| 28 |
|
|
|
|
| 32 |
assert "Invalid CSV format" in str(exc_info.value)
|
| 33 |
|
| 34 |
def test_csv_with_quoted_fields_can_be_read(self):
|
|
|
|
| 35 |
# Create CSV with quoted fields and enough rows
|
| 36 |
+
base_row = 'test_antibody,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
|
| 37 |
+
csv_content = "antibody_name,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
|
| 38 |
csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
|
| 39 |
|
| 40 |
df = validate_csv_can_be_read(csv_content)
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
class TestValidateDataframe:
|
|
|
|
|
|
|
| 46 |
def test_valid_dataframe_passes(self, valid_input_dataframe):
|
|
|
|
| 47 |
validate_dataframe(valid_input_dataframe)
|
| 48 |
|
| 49 |
def test_missing_columns_raises_error(self, valid_input_dataframe):
|
|
|
|
| 50 |
missing_column = REQUIRED_COLUMNS[0]
|
| 51 |
df = valid_input_dataframe.copy()
|
| 52 |
df.drop(columns=[missing_column], inplace=True)
|
|
|
|
| 57 |
assert f"Missing required columns: {missing_column}" in str(exc_info.value)
|
| 58 |
|
| 59 |
def test_empty_dataframe_raises_error(self, valid_input_dataframe):
|
|
|
|
| 60 |
empty_df = valid_input_dataframe.head(0)
|
| 61 |
|
| 62 |
with pytest.raises(gr.Error) as exc_info:
|
|
|
|
| 65 |
assert "CSV file is empty" in str(exc_info.value)
|
| 66 |
|
| 67 |
def test_insufficient_rows_raises_error(self, valid_input_dataframe):
|
|
|
|
| 68 |
df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
|
| 69 |
with pytest.raises(gr.Error) as exc_info:
|
| 70 |
validate_dataframe(df)
|
|
|
|
| 74 |
)
|
| 75 |
|
| 76 |
def test_missing_values_raises_error(self, valid_input_dataframe):
|
|
|
|
| 77 |
bad_column = REQUIRED_COLUMNS[0]
|
| 78 |
df = valid_input_dataframe.copy()
|
| 79 |
df[bad_column] = [None] * len(df)
|
|
|
|
| 83 |
assert f"contains {len(df)} missing values" in str(exc_info.value)
|
| 84 |
|
| 85 |
def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
|
|
|
|
| 86 |
extra_column = "extra_column_1"
|
| 87 |
df = valid_input_dataframe.copy()
|
| 88 |
df[extra_column] = ["extra1"] * len(df)
|
| 89 |
df[extra_column] = ["extra2"] * len(df)
|
| 90 |
validate_dataframe(df)
|
| 91 |
|
| 92 |
+
def test_duplicate_antibody_names_raises_error(self, valid_input_dataframe):
|
| 93 |
+
df = valid_input_dataframe.copy()
|
| 94 |
+
df = pd.concat([df, df.head(1)], ignore_index=True)
|
| 95 |
+
with pytest.raises(gr.Error) as exc_info:
|
| 96 |
+
validate_dataframe(df)
|
| 97 |
+
assert "CSV should have only one row per antibody. Found 1 duplicates." in str(
|
| 98 |
+
exc_info.value
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
def test_unrecognized_antibody_names_raises_error(self, valid_input_dataframe):
|
| 102 |
+
df = valid_input_dataframe.copy()
|
| 103 |
+
df.loc[0, "antibody_name"] = "unrecognized_antibody"
|
| 104 |
+
with pytest.raises(gr.Error) as exc_info:
|
| 105 |
+
validate_dataframe(df)
|
| 106 |
+
assert f"Found unrecognized antibody names: {'unrecognized_antibody'}" in str(
|
| 107 |
+
exc_info.value
|
| 108 |
+
)
|
| 109 |
|
|
|
|
|
|
|
| 110 |
|
| 111 |
+
class TestValidateCsvFile:
|
| 112 |
def test_valid_csv_passes(self, valid_csv_content):
|
|
|
|
| 113 |
validate_csv_file(valid_csv_content)
|
validation.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import io
|
| 3 |
import gradio as gr
|
| 4 |
-
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
|
| 5 |
|
| 6 |
|
| 7 |
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
|
|
@@ -61,19 +61,29 @@ def validate_dataframe(df: pd.DataFrame) -> None:
|
|
| 61 |
if df.empty:
|
| 62 |
raise gr.Error("❌ CSV file is empty")
|
| 63 |
|
| 64 |
-
#
|
| 65 |
for col in REQUIRED_COLUMNS:
|
| 66 |
missing_count = df[col].isnull().sum()
|
| 67 |
if missing_count > 0:
|
| 68 |
raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
|
| 69 |
|
| 70 |
-
#
|
| 71 |
if len(df) < MINIMAL_NUMBER_OF_ROWS:
|
| 72 |
raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
def validate_csv_file(file_content: str) -> None:
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import io
|
| 3 |
import gradio as gr
|
| 4 |
+
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ANTIBODY_NAMES
|
| 5 |
|
| 6 |
|
| 7 |
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
|
|
|
|
| 61 |
if df.empty:
|
| 62 |
raise gr.Error("❌ CSV file is empty")
|
| 63 |
|
| 64 |
+
# No missing values in required columns
|
| 65 |
for col in REQUIRED_COLUMNS:
|
| 66 |
missing_count = df[col].isnull().sum()
|
| 67 |
if missing_count > 0:
|
| 68 |
raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
|
| 69 |
|
| 70 |
+
# Above minimal number of rows
|
| 71 |
if len(df) < MINIMAL_NUMBER_OF_ROWS:
|
| 72 |
raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
|
| 73 |
|
| 74 |
+
# All names should be unique
|
| 75 |
+
n_duplicates = df["antibody_name"].duplicated().sum()
|
| 76 |
+
if n_duplicates > 0:
|
| 77 |
+
raise gr.Error(
|
| 78 |
+
f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
# All antibody names should be recognizable
|
| 82 |
+
unrecognized_antibodies = set(df["antibody_name"]) - set(ANTIBODY_NAMES)
|
| 83 |
+
if unrecognized_antibodies:
|
| 84 |
+
raise gr.Error(
|
| 85 |
+
f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
|
| 86 |
+
)
|
| 87 |
|
| 88 |
|
| 89 |
def validate_csv_file(file_content: str) -> None:
|