makiling's picture
Upload folder using huggingface_hub
5f58699 verified
raw
history blame
878 Bytes
"""Loader for the Harvey et al. 2022 dataset."""
from __future__ import annotations
import pandas as pd
from .utils import standardize_frame
_COLUMN_ALIASES = {
"id": ("id", "clone_id"),
"heavy_seq": ("heavy", "heavy_chain", "sequence"),
"light_seq": ("light", "light_chain"),
"label": ("polyreactive", "is_polyreactive"),
}
_LABEL_MAP = {
"polyreactive": 1,
"non-polyreactive": 0,
"positive": 1,
"negative": 0,
1: 1,
0: 0,
"1": 1,
"0": 0,
}
def load_dataframe(path_or_url: str, heavy_only: bool = True) -> pd.DataFrame:
    """Read the Harvey CSV and convert it to the canonical format.

    Args:
        path_or_url: Location of the CSV file; passed unchanged to
            ``pandas.read_csv``.
        heavy_only: Forwarded as-is to ``standardize_frame``.

    Returns:
        The value returned by ``standardize_frame`` for the parsed CSV.
    """
    # Fixed, dataset-specific settings; only ``heavy_only`` is caller-controlled.
    # NOTE(review): ``is_test`` is always True here -- presumably this dataset
    # is used for evaluation only; confirm against ``standardize_frame``.
    settings = {
        "source": "harvey2022",
        "heavy_only": heavy_only,
        "column_aliases": _COLUMN_ALIASES,
        "label_map": _LABEL_MAP,
        "is_test": True,
    }
    raw_table = pd.read_csv(path_or_url)
    return standardize_frame(raw_table, **settings)