Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import textstat | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
def compute_text_features(stem, leadin, options):
    """Build the 8-value feature row describing one multiple-choice question.

    The question text is ``stem`` and ``leadin`` joined by a space and
    stripped. The returned features are, in order:

    0. character length of the question text
    1. whitespace-delimited word count of the question text
    2. ``textstat.flesch_reading_ease`` of the text (0 when the text has
       five words or fewer)
    3. 1 if the text contains a negation word as a whole word, else 0
    4. mean word count of the options
    5. variance of the option word counts
    6. mean Jaccard token overlap between the question text and each option
    7. option dissimilarity: 1 minus the mean pairwise TF-IDF cosine
       similarity between distinct options

    Args:
        stem: Question stem; interpolated into the question text.
        leadin: Lead-in text; interpolated after the stem.
        options: Iterable of answer options. Each one is converted with
            ``str``. ``None`` or an empty iterable yields 0 for features 4-7.

    Returns:
        A NumPy array of shape ``(1, 8)``.
    """
    # Function-scope import so this block does not depend on a change to the
    # module's import section.
    import re

    text = f"{stem} {leadin}".strip()
    lowered = text.lower()
    option_texts = [] if options is None else [str(o) for o in options]

    q_char_len = len(text)
    q_word_len = len(text.split())
    q_readability = textstat.flesch_reading_ease(text) if q_word_len > 5 else 0

    # Compare whole words only. A substring test would flag "know",
    # "diagnosis" or "note" as negations because they contain "no"/"not".
    negation_words = {"no", "not", "except", "unless", "never", "least"}
    q_has_negation = int(
        not negation_words.isdisjoint(re.findall(r"[a-z]+", lowered))
    )

    # Option-level stats
    if option_texts:
        opt_word_counts = [len(o.split()) for o in option_texts]
        opt_avg_len = np.mean(opt_word_counts)
        opt_len_var = np.var(opt_word_counts)

        # Overlap between the question text and each option. The question's
        # token set is the same for every option, so build it once.
        question_tokens = set(lowered.split())

        def jaccard(option_text):
            option_tokens = set(option_text.lower().split())
            union = question_tokens | option_tokens
            return len(question_tokens & option_tokens) / len(union) if union else 0

        opt_overlap_mean = np.mean([jaccard(o) for o in option_texts])
    else:
        # No options: avoid np.mean/np.var on an empty list (NaN + warning).
        opt_avg_len = opt_len_var = opt_overlap_mean = 0.0

    # --- TF-IDF based option dissimilarity (8th feature) ---
    if len(option_texts) < 2:
        # There are no distinct option pairs to compare, so report 0.
        opt_entropy = 0
    else:
        # The vocabulary/IDF weights are fitted on the options plus the
        # question text; only the options are then vectorised and compared.
        tfidf = TfidfVectorizer()
        tfidf.fit(option_texts + [text])
        opt_vecs = tfidf.transform(option_texts)
        sim_matrix = cosine_similarity(opt_vecs)
        # Mask self-similarity so only distinct pairs enter the mean.
        np.fill_diagonal(sim_matrix, np.nan)
        avg_sim = np.nanmean(sim_matrix)
        opt_entropy = 1 - avg_sim if not np.isnan(avg_sim) else 0
    # --------------------------------------------------------

    return np.array([[q_char_len, q_word_len, q_readability, q_has_negation,
                      opt_avg_len, opt_len_var, opt_overlap_mean, opt_entropy]])