Spaces:
Sleeping
Sleeping
File size: 6,428 Bytes
5f58699 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
"""ANARCI/ANARCII numbering helpers."""
from __future__ import annotations
from dataclasses import dataclass
from functools import lru_cache
from typing import Dict, List, Sequence, Tuple
try:
from anarcii.pipeline import Anarcii # type: ignore
except ImportError: # pragma: no cover - optional dependency
Anarcii = None
@dataclass(slots=True)
class NumberedResidue:
"""Single residue with IMGT numbering metadata."""
position: int
insertion: str
amino_acid: str
@dataclass(slots=True)
class NumberedSequence:
"""Container for numbering results and derived regions."""
sequence: str
scheme: str
chain_type: str
residues: list[NumberedResidue]
regions: dict[str, str]
_IMGT_HEAVY_REGIONS: Sequence[Tuple[str, int, int]] = (
("FR1", 1, 26),
("CDRH1", 27, 38),
("FR2", 39, 55),
("CDRH2", 56, 65),
("FR3", 66, 104),
("CDRH3", 105, 117),
("FR4", 118, 128),
)
_IMGT_LIGHT_REGIONS: Sequence[Tuple[str, int, int]] = (
("FR1", 1, 26),
("CDRL1", 27, 38),
("FR2", 39, 55),
("CDRL2", 56, 65),
("FR3", 66, 104),
("CDRL3", 105, 117),
("FR4", 118, 128),
)
_REGION_MAP: dict[Tuple[str, str], Sequence[Tuple[str, int, int]]] = {
("imgt", "H"): _IMGT_HEAVY_REGIONS,
("imgt", "L"): _IMGT_LIGHT_REGIONS,
}
_VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
_DEFAULT_SCHEME = "imgt"
_DEFAULT_CHAIN = "H"
_DEFAULT_NUMBERER: AnarciNumberer | None = None
def _sanitize_sequence(sequence: str) -> str:
return "".join(residue for residue in sequence.upper() if residue in _VALID_AMINO_ACIDS)
def get_default_numberer() -> AnarciNumberer:
global _DEFAULT_NUMBERER
if _DEFAULT_NUMBERER is None:
_DEFAULT_NUMBERER = AnarciNumberer(chain_type=_DEFAULT_CHAIN, cpu=True, ncpu=1, verbose=False)
return _DEFAULT_NUMBERER
def trim_variable_domain(
sequence: str,
*,
numberer: AnarciNumberer | None = None,
scheme: str = _DEFAULT_SCHEME,
chain_type: str = _DEFAULT_CHAIN,
fallback_length: int = 130,
) -> str:
"""Return the FR1–FR4 variable domain for a heavy/light chain sequence."""
cleaned = _sanitize_sequence(sequence)
if not cleaned:
return ""
active_numberer = numberer or get_default_numberer()
try:
numbered = active_numberer.number_sequence(cleaned)
except Exception: # pragma: no cover - best effort safeguard
return cleaned[:fallback_length]
region_sets = _region_boundaries(scheme, chain_type)
pieces: list[str] = []
for region_name, _start, _end in region_sets:
segment = numbered.regions.get(region_name, "")
if segment:
pieces.append(segment)
trimmed = "".join(pieces)
if not trimmed:
trimmed = numbered.regions.get("full", "")
if not trimmed:
trimmed = cleaned[:fallback_length]
return trimmed
def _normalise_chain_type(chain_type: str) -> str:
upper = chain_type.upper()
if upper in {"H", "HV"}:
return "H"
if upper in {"L", "K", "LV", "KV"}:
return "L"
return upper
class AnarciNumberer:
"""Thin wrapper around the ANARCII pipeline to obtain IMGT regions."""
def __init__(
self,
*,
scheme: str = "imgt",
chain_type: str = "H",
cpu: bool = True,
ncpu: int = 1,
verbose: bool = False,
) -> None:
if Anarcii is None: # pragma: no cover - optional dependency guard
msg = (
"anarcii is required for numbering but is not installed."
" Install 'anarcii' to enable ANARCI-based features."
)
raise ImportError(msg)
self.scheme = scheme
self.expected_chain_type = _normalise_chain_type(chain_type)
self.cpu = cpu
self.ncpu = ncpu
self.verbose = verbose
self._runner = None
def _ensure_runner(self) -> Anarcii:
if self._runner is None:
self._runner = Anarcii(
seq_type="antibody",
mode="accuracy",
batch_size=1,
cpu=self.cpu,
ncpu=self.ncpu,
verbose=self.verbose,
)
return self._runner
def number_sequence(self, sequence: str) -> NumberedSequence:
"""Return numbering metadata for a single amino-acid sequence."""
runner = self._ensure_runner()
output = runner.number([sequence])
record = next(iter(output.values()))
if record.get("error"):
raise RuntimeError(f"ANARCI failed: {record['error']}")
scheme = record.get("scheme", self.scheme)
detected_chain = record.get("chain_type", self.expected_chain_type)
normalised_chain = _normalise_chain_type(detected_chain)
if self.expected_chain_type and normalised_chain != self.expected_chain_type:
msg = (
f"Expected chain type {self.expected_chain_type!r} but got"
f" {normalised_chain!r}"
)
raise ValueError(msg)
residues = [
NumberedResidue(position=pos, insertion=ins, amino_acid=aa)
for (pos, ins), aa in record["numbering"]
]
regions = _extract_regions(
residues=residues,
scheme=scheme,
chain_type=normalised_chain,
)
return NumberedSequence(
sequence=sequence,
scheme=scheme,
chain_type=normalised_chain,
residues=residues,
regions=regions,
)
@lru_cache(maxsize=32)
def _region_boundaries(scheme: str, chain_type: str) -> Sequence[Tuple[str, int, int]]:
key = (scheme.lower(), chain_type.upper())
return _REGION_MAP.get(key, ())
def _extract_regions(
*,
residues: Sequence[NumberedResidue],
scheme: str,
chain_type: str,
) -> dict[str, str]:
boundaries = _region_boundaries(scheme, chain_type)
slots: Dict[str, List[str]] = {name: [] for name, _, _ in boundaries}
slots["full"] = []
for residue in residues:
aa = residue.amino_acid
if aa == "-":
continue
slots["full"].append(aa)
for name, start, end in boundaries:
if start <= residue.position <= end:
slots.setdefault(name, []).append(aa)
break
return {key: "".join(value) for key, value in slots.items()}
|