File size: 878 Bytes
5f58699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""Loader for the Harvey et al. 2022 dataset."""

from __future__ import annotations

import pandas as pd

from .utils import standardize_frame

# Column-name table handed to ``standardize_frame`` as ``column_aliases``
# (see ``load_dataframe``).
# NOTE(review): each key looks like a canonical output column and each tuple
# like the raw CSV header spellings accepted for it -- confirm against
# ``standardize_frame`` in ``.utils``, which is not visible from this module.
_COLUMN_ALIASES: dict[str, tuple[str, ...]] = {
    "id": ("id", "clone_id"),
    "heavy_seq": ("heavy", "heavy_chain", "sequence"),
    "light_seq": ("light", "light_chain"),
    "label": ("polyreactive", "is_polyreactive"),
}

# Label lookup handed to ``standardize_frame`` as ``label_map``
# (see ``load_dataframe``). Every entry resolves to the integer 1 or 0.
# NOTE(review): 1/0 appear both as ints and as strings, presumably so the
# lookup works whether the raw label column was parsed as numbers or as
# text -- confirm how ``standardize_frame`` applies this mapping.
_LABEL_MAP: dict[str | int, int] = {
    "polyreactive": 1,
    "non-polyreactive": 0,
    "positive": 1,
    "negative": 0,
    1: 1,
    0: 0,
    "1": 1,
    "0": 0,
}


def load_dataframe(path_or_url: str, heavy_only: bool = True) -> pd.DataFrame:
    """Read the Harvey CSV and return it after ``standardize_frame``.

    The file is parsed with ``pandas.read_csv`` using its default options,
    then passed to ``standardize_frame`` together with this module's
    column-alias and label tables.

    Args:
        path_or_url: Location of the CSV, given straight to
            ``pandas.read_csv``.
        heavy_only: Forwarded unchanged to ``standardize_frame``.

    Returns:
        Whatever ``standardize_frame`` produces for the parsed frame.

    Nothing is caught here, so errors raised by ``pandas.read_csv`` or
    ``standardize_frame`` propagate to the caller.
    """
    raw = pd.read_csv(path_or_url)

    # Settings for this dataset; only ``heavy_only`` is caller-controlled.
    # The source tag is always "harvey2022" and ``is_test`` is always True.
    options = {
        "source": "harvey2022",
        "heavy_only": heavy_only,
        "column_aliases": _COLUMN_ALIASES,
        "label_map": _LABEL_MAP,
        "is_test": True,
    }
    return standardize_frame(raw, **options)