import numpy as np import textstat from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity def compute_text_features(stem, leadin, options): text = f"{stem} {leadin}".strip() q_char_len = len(text) q_word_len = len(text.split()) q_readability = textstat.flesch_reading_ease(text) if q_word_len > 5 else 0 q_has_negation = int(any(w in text.lower() for w in ["no", "not", "except", "unless", "never", "least"])) # Option-level stats opt_avg_len = np.mean([len(str(o).split()) for o in options]) opt_len_var = np.var([len(str(o).split()) for o in options]) # Overlap between stem and options def jaccard(a, b): sa, sb = set(a.lower().split()), set(b.lower().split()) return len(sa & sb) / len(sa | sb) if sa | sb else 0 opt_overlap_mean = np.mean([jaccard(text, o) for o in options]) # --- TF-IDF entropy (8th feature) --- tfidf = TfidfVectorizer() tfidf.fit(options + [text]) opt_vecs = tfidf.transform(options) sim_matrix = cosine_similarity(opt_vecs) np.fill_diagonal(sim_matrix, np.nan) avg_sim = np.nanmean(sim_matrix) opt_entropy = 1 - avg_sim if not np.isnan(avg_sim) else 0 # ------------------------------------- return np.array([[q_char_len, q_word_len, q_readability, q_has_negation, opt_avg_len, opt_len_var, opt_overlap_mean, opt_entropy]])