sqb-predict-api / utils /preprocess.py
Ahmad Hathim bin Ahmad Azman
Add FastAPI backend for SQB prediction
beae064
raw
history blame
1.44 kB
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def _jaccard(a, b):
    """Return the Jaccard similarity of the lower-cased whitespace tokens of *a* and *b*.

    When neither string has any tokens the union is empty; 0 is returned
    instead of dividing by zero.
    """
    sa, sb = set(a.lower().split()), set(b.lower().split())
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0


def _option_entropy(text, option_texts):
    """Return 1 minus the mean pairwise TF-IDF cosine similarity of the options.

    The vectorizer is fitted on the options plus the question text, but only
    the options are compared with each other. Returns 0 when the score is
    undefined: fewer than two options, or no usable vocabulary.
    """
    # With fewer than two options there are no off-diagonal pairs to average.
    if len(option_texts) < 2:
        return 0

    tfidf = TfidfVectorizer()
    try:
        tfidf.fit(option_texts + [text])
    except ValueError:
        # TfidfVectorizer raises ValueError on an empty vocabulary (e.g. every
        # document consists only of single-character tokens). Use the same 0
        # fallback as the undefined-similarity case below.
        return 0

    sim_matrix = cosine_similarity(tfidf.transform(option_texts))
    # Mask self-similarity so only distinct option pairs enter the mean.
    np.fill_diagonal(sim_matrix, np.nan)
    avg_sim = np.nanmean(sim_matrix)
    return 1 - avg_sim if not np.isnan(avg_sim) else 0


def compute_text_features(stem, leadin, options):
    """Build the 8-value feature row for one question.

    Args:
        stem: Question stem; joined with ``leadin`` by a single space.
        leadin: Lead-in text appended after the stem.
        options: Iterable of answer options. Each option is converted with
            ``str`` before use; ``None`` or an empty iterable is treated as
            "no options".

    Returns:
        A ``numpy`` array of shape ``(1, 8)`` holding, in order:
        character length of the question text, word count, Flesch reading
        ease (0 when the text has 5 words or fewer), negation flag (0/1),
        mean option word count, variance of option word counts, mean Jaccard
        overlap between the question text and each option, and the TF-IDF
        based option "entropy" (1 - mean pairwise cosine similarity).
        All option-level features are 0 when there are no options.
    """
    text = f"{stem} {leadin}".strip()
    # Coerce once so every option-level feature sees the same string form;
    # this also accepts tuples and other iterables.
    option_texts = [] if options is None else [str(o) for o in options]

    q_char_len = len(text)
    q_word_len = len(text.split())
    # Readability is only computed for texts longer than 5 words.
    q_readability = textstat.flesch_reading_ease(text) if q_word_len > 5 else 0
    # NOTE(review): this is a substring test, so "no" also matches inside
    # words such as "know". Kept unchanged so existing feature values do not
    # shift; confirm whether whole-word matching is what is actually wanted.
    lowered = text.lower()
    q_has_negation = int(any(
        w in lowered for w in ["no", "not", "except", "unless", "never", "least"]
    ))

    if option_texts:
        # Option-level stats
        opt_word_counts = [len(o.split()) for o in option_texts]
        opt_avg_len = np.mean(opt_word_counts)
        opt_len_var = np.var(opt_word_counts)
        # Overlap between stem and options
        opt_overlap_mean = np.mean([_jaccard(text, o) for o in option_texts])
    else:
        # np.mean / np.var of an empty list would yield NaN; use neutral zeros.
        opt_avg_len = opt_len_var = opt_overlap_mean = 0.0

    # TF-IDF entropy (8th feature)
    opt_entropy = _option_entropy(text, option_texts)

    return np.array([[q_char_len, q_word_len, q_readability, q_has_negation,
                      opt_avg_len, opt_len_var, opt_overlap_mean, opt_entropy]])