bhardwaj08sarthak committed on
Commit 2ec4936 · verified · 1 Parent(s): 3d115e7

Delete level_classifier_tool.py

Files changed (1)
  1. level_classifier_tool.py +0 -237
level_classifier_tool.py DELETED
@@ -1,237 +0,0 @@
- from __future__ import annotations
-
- from dataclasses import dataclass
- from typing import Dict, List, Tuple, Iterable, Optional, Literal, Callable, Any
- import math
- import os
-
- # Optional heavy deps are imported lazily when needed
- _TOK = None
- _MODEL = None
- _TORCH = None
-
- Agg = Literal["mean", "max", "topk_mean"]
-
-
- # --------------------------- Embedding backend ---------------------------
-
- @dataclass
- class HFEmbeddingBackend:
-     """
-     Minimal huggingface transformers encoder for sentence-level embeddings.
-     Uses mean pooling over last_hidden_state and L2 normalizes the result.
-     """
-     model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
-     # "cuda" | "cpu" | None -> (env or "cpu")
-     # We default to CPU on Spaces to avoid ZeroGPU device mixups.
-     device: Optional[str] = None
-
-     def _lazy_import(self) -> None:
-         global _TOK, _MODEL, _TORCH
-         if _TORCH is None:
-             import torch as _torch
-             _TORCH = _torch
-         if _TOK is None or _MODEL is None:
-             from transformers import AutoTokenizer, AutoModel  # type: ignore
-             _TOK = AutoTokenizer.from_pretrained(self.model_name)
-             _MODEL = AutoModel.from_pretrained(self.model_name)
-         # Prefer explicit device -> env override -> default to CPU
-         dev = self.device or os.getenv("EMBEDDING_DEVICE") or "cpu"
-         _MODEL.to(dev).eval()
-         self.device = dev
-
-     def encode(self, texts: Iterable[str], batch_size: int = 32) -> "tuple[_TORCH.Tensor, list[str]]":
-         """
-         Returns (embeddings, texts_list). Embeddings are a CPU torch.Tensor [N, D], unit-normalized.
-         """
-         self._lazy_import()
-         torch = _TORCH  # local alias
-         texts_list = list(texts)
-         if not texts_list:
-             # Hidden size available after _lazy_import
-             return torch.empty((0, _MODEL.config.hidden_size)), []  # type: ignore
-
-         all_out = []
-         with torch.inference_mode():
-             for i in range(0, len(texts_list), batch_size):
-                 batch = texts_list[i:i + batch_size]
-                 # Tokenize and move to model device
-                 enc = _TOK(batch, padding=True, truncation=True, return_tensors="pt").to(self.device)  # type: ignore
-                 out = _MODEL(**enc)
-                 last = out.last_hidden_state  # [B, T, H]
-                 mask = enc["attention_mask"].unsqueeze(-1)  # [B, T, 1]
-                 # mean pool
-                 summed = (last * mask).sum(dim=1)
-                 counts = mask.sum(dim=1).clamp(min=1)
-                 pooled = summed / counts
-                 # L2 normalize
-                 pooled = pooled / pooled.norm(dim=1, keepdim=True).clamp(min=1e-12)
-                 # Collect on CPU for downstream ops
-                 all_out.append(pooled.cpu())
-
-         embs = torch.cat(all_out, dim=0) if all_out else torch.empty((0, _MODEL.config.hidden_size))  # type: ignore
-         return embs, texts_list
-
-
- # --------------------------- Utilities ---------------------------
-
- def _normalize_whitespace(s: str) -> str:
-     return " ".join(s.strip().split())
-
-
- def _default_preprocess(s: str) -> str:
-     # Keep simple, deterministic preprocessing. Users can override with a custom callable.
-     return _normalize_whitespace(s)
-
-
- @dataclass
- class PhraseIndex:
-     phrases_by_level: Dict[str, List[str]]
-     embeddings_by_level: Dict[str, "Any"]
-     model_name: str
-
-
- def build_phrase_index(
-     backend: HFEmbeddingBackend,
-     phrases_by_level: Dict[str, Iterable[str]],
- ) -> PhraseIndex:
-     """
-     Pre-encode all anchor phrases per level into a searchable index.
-     """
-     # Flatten texts while preserving level boundaries
-     cleaned: Dict[str, List[str]] = {lvl: [_default_preprocess(p) for p in phrases] for lvl, phrases in phrases_by_level.items()}
-     all_texts: List[str] = []
-     spans: List[Tuple[str, int, int]] = []  # (level, start, end) in the flat list
-     cur = 0
-     for lvl, plist in cleaned.items():
-         start = cur
-         all_texts.extend(plist)
-         cur += len(plist)
-         spans.append((lvl, start, cur))
-
-     embs, _ = backend.encode(all_texts)  # embs is a CPU torch.Tensor [N, D]
-
-     # Slice embeddings back into level buckets
-     torch = _TORCH
-     embeddings_by_level: Dict[str, "Any"] = {}
-     for lvl, start, end in spans:
-         if end > start:
-             embeddings_by_level[lvl] = embs[start:end]  # torch.Tensor slice [n_i, D]
-         else:
-             embeddings_by_level[lvl] = torch.empty((0, embs.shape[1]))  # type: ignore
-
-     return PhraseIndex(
-         phrases_by_level={lvl: list(pl) for lvl, pl in cleaned.items()},
-         embeddings_by_level=embeddings_by_level,
-         model_name=backend.model_name
-     )
-
-
- def _aggregate_sims(
-     sims: "Any", agg: Agg, topk: int
- ) -> float:
-     """
-     Aggregate a 1D tensor of similarities into a single score.
-     """
-     torch = _TORCH
-     if sims.numel() == 0:
-         return float("nan")
-     if agg == "mean":
-         return float(sims.mean().item())
-     if agg == "max":
-         return float(sims.max().item())
-     if agg == "topk_mean":
-         k = min(topk, sims.numel())
-         topk_vals, _ = torch.topk(sims, k)
-         return float(topk_vals.mean().item())
-     raise ValueError(f"Unknown agg: {agg}")
-
-
- # --------------------------- Public API ---------------------------
-
- def classify_levels_phrases(
-     question: str,
-     blooms_phrases: Dict[str, Iterable[str]],
-     dok_phrases: Dict[str, Iterable[str]],
-     *,
-     model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
-     agg: Agg = "max",
-     topk: int = 5,
-     preprocess: Optional[Callable[[str], str]] = None,
-     backend: Optional[HFEmbeddingBackend] = None,
-     prebuilt_bloom_index: Optional[PhraseIndex] = None,
-     prebuilt_dok_index: Optional[PhraseIndex] = None,
-     return_phrase_matches: bool = True,
- ) -> Dict[str, Any]:
-     """
-     Score a question against Bloom's taxonomy and DOK (Depth of Knowledge)
-     using cosine similarity to level-specific anchor phrases.
-     """
-     preprocess = preprocess or _default_preprocess
-     question_clean = preprocess(question)
-
-     # Prepare backend (defaults to CPU)
-     be = backend or HFEmbeddingBackend(model_name=model_name)
-
-     # Build / reuse indices
-     bloom_index = prebuilt_bloom_index or build_phrase_index(be, blooms_phrases)
-     dok_index = prebuilt_dok_index or build_phrase_index(be, dok_phrases)
-
-     # Encode question -> CPU torch.Tensor [1, D]
-     q_emb, _ = be.encode([question_clean])
-     q_emb = q_emb[0:1]
-     torch = _TORCH
-
-     def _score_block(index: PhraseIndex) -> Tuple[Dict[str, float], Dict[str, List[Tuple[str, float]]]]:
-         scores: Dict[str, float] = {}
-         top_contribs: Dict[str, List[Tuple[str, float]]] = {}
-
-         for lvl, phrases in index.phrases_by_level.items():
-             embs = index.embeddings_by_level[lvl]  # torch.Tensor [N, D]
-             if embs.numel() == 0:
-                 scores[lvl] = float("nan")
-                 top_contribs[lvl] = []
-                 continue
-             # cosine similarity since embs and q_emb are unit-normalized
-             sims = (q_emb @ embs.T).squeeze(0)
-             scores[lvl] = _aggregate_sims(sims, agg, topk)
-             if return_phrase_matches:
-                 k = min(5, sims.numel())
-                 vals, idxs = torch.topk(sims, k)
-                 top_contribs[lvl] = [(phrases[int(i)], float(v.item())) for v, i in zip(vals, idxs)]
-         return scores, top_contribs
-
-     bloom_scores, bloom_top = _score_block(bloom_index)
-     dok_scores, dok_top = _score_block(dok_index)
-
-     def _best(scores: Dict[str, float]) -> Tuple[str, float]:
-         # max with NaN-safe handling
-         best_lvl, best_val = None, -float("inf")
-         for lvl, val in scores.items():
-             if isinstance(val, float) and (not math.isnan(val)) and val > best_val:
-                 best_lvl, best_val = lvl, val
-         return best_lvl or "", best_val
-
-     best_bloom, best_bloom_val = _best(bloom_scores)
-     best_dok, best_dok_val = _best(dok_scores)
-
-     return {
-         "question": question_clean,
-         "model_name": be.model_name,
-         "blooms": {
-             "scores": bloom_scores,
-             "best_level": best_bloom,
-             "best_score": best_bloom_val,
-             "top_phrases": bloom_top if return_phrase_matches else None,
-         },
-         "dok": {
-             "scores": dok_scores,
-             "best_level": best_dok,
-             "best_score": best_dok_val,
-             "top_phrases": dok_top if return_phrase_matches else None,
-         },
-         "config": {
-             "agg": agg,
-             "topk": topk if agg == "topk_mean" else None,
-         },
-     }
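
For context, a minimal usage sketch of the removed classify_levels_phrases entry point as it existed before this commit. The anchor-phrase lists and the sample question below are illustrative placeholders, not taken from the repository.

# Hypothetical usage of the pre-deletion module; phrase lists are made-up examples.
from level_classifier_tool import classify_levels_phrases

blooms = {
    "Remember": ["define the term", "list the steps"],
    "Analyze": ["compare and contrast", "identify the underlying assumptions"],
}
dok = {
    "DOK1": ["recall a fact"],
    "DOK3": ["justify your reasoning with evidence"],
}

result = classify_levels_phrases(
    "Compare the two sorting algorithms and justify which scales better.",
    blooms,
    dok,
    agg="topk_mean",
    topk=3,
)
# Best-matching levels plus per-level similarity scores and top phrases.
print(result["blooms"]["best_level"], result["dok"]["best_level"])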