File size: 1,438 Bytes
beae064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_text_features(stem, leadin, options):
    """Build the 8-element feature row for one question and its options.

    Args:
        stem: Question stem; interpolated into the question text.
        leadin: Lead-in text; appended to the stem, separated by a space.
        options: Iterable of answer options. Each element is converted with
            ``str()`` before use, so non-string values are tolerated.
            ``None`` or an empty iterable yields zeros for all option
            features.

    Returns:
        A ``numpy`` array of shape ``(1, 8)`` holding, in order:
        character count of the question text, whitespace-token count,
        Flesch reading ease (0 when the text has 5 or fewer tokens),
        negation flag (0/1), mean option token count, variance of option
        token counts, mean Jaccard token overlap between the question text
        and each option, and ``1 - mean pairwise TF-IDF cosine similarity``
        between options (0 when it cannot be computed).
    """
    text = f"{stem} {leadin}".strip()
    text_lower = text.lower()

    q_char_len = len(text)
    q_word_len = len(text.split())
    # Readability is only computed for texts longer than 5 tokens.
    q_readability = textstat.flesch_reading_ease(text) if q_word_len > 5 else 0
    # NOTE(review): this is a substring test, so "no" also fires on words such
    # as "know" or "normal". Left unchanged because the feature definition is
    # presumably shared with already-fitted models -- confirm before switching
    # to whole-word matching.
    negation_cues = ("no", "not", "except", "unless", "never", "least")
    q_has_negation = int(any(cue in text_lower for cue in negation_cues))

    # Normalise options once so every feature below sees the same strings
    # (word counts, overlap and TF-IDF all operate on str(o)).
    option_texts = [] if options is None else [str(o) for o in options]

    if option_texts:
        # Option-level length statistics (in whitespace tokens).
        opt_word_counts = [len(o.split()) for o in option_texts]
        opt_avg_len = np.mean(opt_word_counts)
        opt_len_var = np.var(opt_word_counts)

        # Jaccard overlap between the question tokens and each option's tokens.
        text_tokens = set(text_lower.split())
        overlaps = []
        for option in option_texts:
            option_tokens = set(option.lower().split())
            union = text_tokens | option_tokens
            overlaps.append(
                len(text_tokens & option_tokens) / len(union) if union else 0
            )
        opt_overlap_mean = np.mean(overlaps)
    else:
        # No options: avoid NaN (and RuntimeWarnings) from mean/var of [].
        opt_avg_len = opt_len_var = opt_overlap_mean = 0.0

    # --- TF-IDF dissimilarity between options (8th feature) ---
    opt_entropy = 0
    # Pairwise similarity needs at least two options; otherwise keep 0.
    if len(option_texts) >= 2:
        tfidf = TfidfVectorizer()
        try:
            tfidf.fit(option_texts + [text])
        except ValueError:
            # Raised when the vocabulary is empty, e.g. every document
            # consists only of single-character tokens. Keep the 0 fallback.
            pass
        else:
            opt_vecs = tfidf.transform(option_texts)
            sim_matrix = cosine_similarity(opt_vecs)
            # Exclude each option's similarity with itself from the average.
            np.fill_diagonal(sim_matrix, np.nan)
            avg_sim = np.nanmean(sim_matrix)
            if not np.isnan(avg_sim):
                opt_entropy = 1 - avg_sim
    # ----------------------------------------------------------

    return np.array([[q_char_len, q_word_len, q_readability, q_has_negation,
                      opt_avg_len, opt_len_var, opt_overlap_mean, opt_entropy]])