validate dataframe with tests

- submit.py +14 -2
- test/__init__.py +0 -0
- test/conftest.py +26 -0
- test/test_validation.py +109 -0
- validation.py +101 -0
submit.py
CHANGED

@@ -2,12 +2,15 @@ from pathlib import Path
 import tempfile
 from typing import BinaryIO
 import json
+import pandas as pd
+import io

 import gradio as gr
 from datetime import datetime
 import uuid

-from
+from constants import API, SUBMISSIONS_REPO
+from validation import validate_csv_file

 def make_submission(
     submitted_file: BinaryIO,
@@ -17,17 +20,26 @@ def make_submission(
     if user_state is None:
         raise gr.Error("You must submit your username to submit a file.")

+    if submitted_file is None:
+        raise gr.Error("Please upload a CSV file before submitting.")
+
     file_path = submitted_file.name

     if not file_path:
         raise gr.Error("Uploaded file object does not have a valid file path.")

     path_obj = Path(file_path)
+
+    if path_obj.suffix.lower() != '.csv':
+        raise gr.Error("File must be a CSV file. Please upload a .csv file.")
+
     timestamp = datetime.utcnow().isoformat()
     submission_id = str(uuid.uuid4())

     with (path_obj.open("rb") as f_in):
         file_content = f_in.read().decode("utf-8")
+
+    validate_csv_file(file_content)

     # write to dataset
     filename = f"{submission_id}.json"
@@ -49,7 +61,7 @@ def make_submission(
     API.upload_file(
         path_or_fileobj=tmp_name,
         path_in_repo=filename,
-        repo_id=
+        repo_id=SUBMISSIONS_REPO,
         repo_type="dataset",
         commit_message=f"Add submission for {user_state} at {timestamp}"
     )
test/__init__.py
ADDED

File without changes
test/conftest.py
ADDED

@@ -0,0 +1,26 @@
+import pytest
+import pandas as pd
+from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST
+
+
+@pytest.fixture
+def valid_csv_data():
+    """Fixture providing valid CSV data with all required columns"""
+    return {
+        "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
+        "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
+        "vh_protein_sequence": ["EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"] * MINIMAL_NUMBER_OF_ROWS,
+        "vl_protein_sequence": ["DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"] * MINIMAL_NUMBER_OF_ROWS,
+        **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
+    }
+
+
+@pytest.fixture
+def valid_input_dataframe(valid_csv_data):
+    """Fixture providing a valid input dataframe"""
+    return pd.DataFrame(valid_csv_data)
+
+@pytest.fixture
+def valid_csv_content(valid_input_dataframe):
+    """Fixture providing valid CSV content as string"""
+    return valid_input_dataframe.to_csv(index=False)
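The fixtures above, and the tests below, import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST, and REQUIRED_COLUMNS from constants, which this commit does not touch. For orientation only, here is a hypothetical sketch of the shape those constants could take, inferred from the column names used in the fixtures and in the quoted-fields test CSV; the row threshold is a placeholder, not the project's actual value.

# constants.py -- illustrative sketch only, not part of this diff
ASSAY_LIST = ["SEC %Monomer", "HIC", "PR_CHO", "AC-SINS_pH6.0", "AC-SINS_pH7.4", "Tm"]

# Identifier and sequence columns first, then one column per assay.
REQUIRED_COLUMNS = [
    "antibody_id",
    "antibody_name",
    "vh_protein_sequence",
    "vl_protein_sequence",
    *ASSAY_LIST,
]

# Placeholder value; the real minimum row count lives in the actual constants.py.
MINIMAL_NUMBER_OF_ROWS = 5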
test/test_validation.py
ADDED

@@ -0,0 +1,109 @@
+import pytest
+import pandas as pd
+import gradio as gr
+from validation import validate_csv_file, validate_csv_can_be_read, validate_dataframe
+from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
+
+
+class TestValidateCsvCanBeRead:
+    """Test cases for validate_csv_can_be_read function"""
+
+    def test_valid_csv_can_be_read(self, valid_csv_content):
+        """Test that valid CSV content can be read"""
+        df = validate_csv_can_be_read(valid_csv_content)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == MINIMAL_NUMBER_OF_ROWS
+        assert list(df.columns) == list(REQUIRED_COLUMNS)
+
+    def test_empty_csv_raises_error(self):
+        """Test that empty CSV raises an error"""
+        empty_csv = ""
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_csv_can_be_read(empty_csv)
+
+        assert "empty or contains no valid data" in str(exc_info.value)
+
+    def test_invalid_csv_format_raises_error(self):
+        """Test that invalid CSV format raises an error"""
+        # Create a CSV with malformed structure that pandas cannot parse
+        malformed_csv = "column1,column2\nvalue1,\"unclosed quote\nvalue4,value5"
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_csv_can_be_read(malformed_csv)
+
+        assert "Invalid CSV format" in str(exc_info.value)
+
+    def test_csv_with_quoted_fields_can_be_read(self):
+        """Test that CSV with quoted fields can be read"""
+        # Create CSV with quoted fields and enough rows
+        base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
+        csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
+        csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
+
+        df = validate_csv_can_be_read(csv_content)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == MINIMAL_NUMBER_OF_ROWS
+
+
+class TestValidateDataframe:
+    """Test cases for validate_dataframe function"""
+
+    def test_valid_dataframe_passes(self, valid_input_dataframe):
+        """Test that valid DataFrame passes validation"""
+        validate_dataframe(valid_input_dataframe)
+
+    def test_missing_columns_raises_error(self, valid_input_dataframe):
+        """Test that DataFrame with missing columns raises an error"""
+        missing_column = REQUIRED_COLUMNS[0]
+        df = valid_input_dataframe.copy()
+        df.drop(columns=[missing_column], inplace=True)
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(df)
+
+        assert f"Missing required columns: {missing_column}" in str(exc_info.value)
+
+
+    def test_empty_dataframe_raises_error(self, valid_input_dataframe):
+        """Test that empty DataFrame raises an error"""
+        empty_df = valid_input_dataframe.head(0)
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(empty_df)
+
+        assert "CSV file is empty" in str(exc_info.value)
+
+    def test_insufficient_rows_raises_error(self, valid_input_dataframe):
+        """Test that DataFrame with insufficient rows raises an error"""
+        df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(df)
+
+        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(exc_info.value)
+
+    def test_missing_values_raises_error(self, valid_input_dataframe):
+        """Test that DataFrame with missing values raises an error"""
+        bad_column = REQUIRED_COLUMNS[0]
+        df = valid_input_dataframe.copy()
+        df[bad_column] = [None] * len(df)
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(df)
+
+        assert f"contains {len(df)} missing values" in str(exc_info.value)
+
+    def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
+        """Test that DataFrame with extra columns passes validation"""
+        extra_column = "extra_column_1"
+        df = valid_input_dataframe.copy()
+        df[extra_column] = ["extra1"] * len(df)
+        df["extra_column_2"] = ["extra2"] * len(df)
+        validate_dataframe(df)
+
+
+class TestValidateCsvFile:
+    """Test cases for the combined validate_csv_file function"""
+
+    def test_valid_csv_passes(self, valid_csv_content):
+        """Test that a valid CSV with all required columns passes validation"""
+        validate_csv_file(valid_csv_content)
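These tests depend on the fixtures in test/conftest.py above and on the application modules being importable from the repository root. Assuming pytest is installed in the Space's environment, they can be run with `pytest test/ -v`, or programmatically as in this small sketch:

# Run the new validation test suite from Python (equivalent to `pytest test/ -v`).
import pytest

exit_code = pytest.main(["test/", "-v"])
print("all tests passed" if exit_code == 0 else f"pytest exited with code {exit_code}")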
validation.py
ADDED

@@ -0,0 +1,101 @@
+import pandas as pd
+import io
+import gradio as gr
+from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
+
+def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
+    """
+    Validate that the CSV file can be read and parsed.
+
+    Parameters
+    ----------
+    file_content: str
+        The content of the uploaded CSV file.
+
+    Returns
+    -------
+    pd.DataFrame
+        The parsed DataFrame if successful.
+
+    Raises
+    ------
+    gr.Error: If CSV cannot be read or parsed
+    """
+    try:
+        # Read CSV content
+        df = pd.read_csv(io.StringIO(file_content))
+        return df
+
+    except pd.errors.EmptyDataError:
+        raise gr.Error(
+            "❌ CSV file is empty or contains no valid data"
+        )
+    except pd.errors.ParserError as e:
+        raise gr.Error(
+            f"❌ Invalid CSV format<br><br>"
+            f"Error: {str(e)}"
+        )
+    except UnicodeDecodeError:
+        raise gr.Error(
+            "❌ File encoding error<br><br>"
+            "Your file appears to have an unsupported encoding.<br>"
+            "Please save your CSV file with UTF-8 encoding and try again."
+        )
+
+def validate_dataframe(df: pd.DataFrame) -> None:
+    """
+    Validate the DataFrame content and structure.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame to validate.
+
+    Raises
+    ------
+    gr.Error: If validation fails
+    """
+    # Required columns should be present
+    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
+    if missing_columns:
+        raise gr.Error(
+            f"❌ Missing required columns: {', '.join(missing_columns)}"
+        )
+
+    # Data should not be empty
+    if df.empty:
+        raise gr.Error(
+            "❌ CSV file is empty"
+        )
+
+    # Check for missing values in required columns
+    for col in REQUIRED_COLUMNS:
+        missing_count = df[col].isnull().sum()
+        if missing_count > 0:
+            raise gr.Error(
+                f"❌ Column '{col}' contains {missing_count} missing values"
+            )
+
+    # Check for reasonable number of rows
+    if len(df) < MINIMAL_NUMBER_OF_ROWS:
+        raise gr.Error(
+            f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
+        )
+
+    print(f"✅ CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}")
+
+def validate_csv_file(file_content: str) -> None:
+    """
+    Validate the uploaded CSV file.
+
+    Parameters
+    ----------
+    file_content: str
+        The content of the uploaded CSV file.
+
+    Raises
+    ------
+    gr.Error: If validation fails
+    """
+    df = validate_csv_can_be_read(file_content)
+    validate_dataframe(df)
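With these three functions in place, validate_csv_file is the single entry point that submit.py now calls before uploading a submission. A minimal sketch of exercising it outside the Gradio app, using a deliberately incomplete CSV (the exact error text depends on REQUIRED_COLUMNS in constants.py):

import gradio as gr

from validation import validate_csv_file

# A submission that parses as CSV but is missing most required columns and rows.
bad_csv = "antibody_id,vh_protein_sequence\nAB001,EVQLVESGGG\n"

try:
    validate_csv_file(bad_csv)
except gr.Error as err:
    # In the Space, Gradio surfaces this message to the user as an error popup.
    print(f"Submission rejected: {err}")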