{ "title": "K-Means Clustering Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions designed to test and deepen your understanding of K-Means Clustering, covering basic concepts, algorithm steps, practical scenarios, and challenges in high-dimensional or real-world datasets.", "questions": [ { "id": 1, "questionText": "What is the primary goal of K-Means Clustering?", "options": [ "Partition data into K clusters minimizing within-cluster variance", "Reduce the dimensionality of the dataset", "Detect outliers in the dataset", "Classify data into predefined categories" ], "correctAnswerIndex": 0, "explanation": "K-Means aims to divide data into K clusters such that the sum of squared distances between points and their cluster centroid is minimized." }, { "id": 2, "questionText": "In K-Means, what does a 'centroid' represent?", "options": [ "A random point from the dataset", "The farthest point from the cluster", "The maximum value in the cluster", "The mean position of all points in the cluster" ], "correctAnswerIndex": 3, "explanation": "Centroid is the mean of all points in a cluster and represents the cluster's center." }, { "id": 3, "questionText": "Which step is repeated in K-Means until convergence?", "options": [ "Compute correlation matrix", "Assign points to nearest centroid and update centroids", "Remove outliers", "Randomly shuffle data points" ], "correctAnswerIndex": 1, "explanation": "K-Means iteratively assigns points to the nearest centroid and recalculates centroids until assignments stabilize." }, { "id": 4, "questionText": "Scenario: K-Means converges but clusters are uneven in size. Likely reason?", "options": [ "Centroids are incorrect", "Distance metric used is Euclidean", "Data distribution is skewed", "Algorithm failed" ], "correctAnswerIndex": 2, "explanation": "K-Means partitions based on distances; skewed or non-spherical distributions can lead to uneven cluster sizes." }, { "id": 5, "questionText": "What is the main limitation of K-Means clustering?", "options": [ "Sensitive to outliers", "Requires predefined number of clusters (K)", "Only works for numerical data", "All of the above" ], "correctAnswerIndex": 3, "explanation": "K-Means works only with numerical data, needs K as input, and is sensitive to outliers." }, { "id": 6, "questionText": "Scenario: K-Means applied to customer locations. Distance metric to use?", "options": [ "Euclidean distance", "Hamming distance", "Cosine similarity", "Jaccard index" ], "correctAnswerIndex": 0, "explanation": "Euclidean distance is standard for K-Means and spatial numerical data." }, { "id": 7, "questionText": "Scenario: K-Means on 2D points results vary with different initial centroids. Solution?", "options": [ "Use hierarchical clustering instead", "Ignore initial centroids", "Reduce K", "Use K-Means++ initialization" ], "correctAnswerIndex": 3, "explanation": "K-Means++ selects better initial centroids to improve convergence and consistency." }, { "id": 8, "questionText": "Scenario: K-Means on concentric circles fails. Reason?", "options": [ "K-Means assumes spherical clusters", "Data contains outliers", "Distance metric wrong", "Algorithm converged too quickly" ], "correctAnswerIndex": 0, "explanation": "K-Means works best for convex, spherical clusters; it cannot separate concentric circular clusters." }, { "id": 9, "questionText": "Scenario: After K-Means clustering, silhouette score is low. 
Interpretation?", "options": [ "Noise ignored automatically", "Clusters overlap or poorly defined", "Clusters are perfect", "Algorithm converged correctly" ], "correctAnswerIndex": 1, "explanation": "Low silhouette score indicates points are close to neighboring cluster centroids; clusters are not well separated." }, { "id": 10, "questionText": "Scenario: Large dataset with millions of points. K-Means limitation?", "options": [ "Algorithm fails completely", "Distance metric is irrelevant", "Cannot calculate centroids", "Convergence can be slow; consider Mini-Batch K-Means" ], "correctAnswerIndex": 3, "explanation": "Mini-Batch K-Means is a faster variant suitable for large datasets." }, { "id": 11, "questionText": "Scenario: K-Means on a dataset with outliers. Effect?", "options": [ "Algorithm removes outliers", "Centroids can shift towards outliers, distorting clusters", "Clusters become more compact", "Clusters ignore outliers automatically" ], "correctAnswerIndex": 1, "explanation": "Outliers can disproportionately affect centroids, leading to poorly defined clusters." }, { "id": 12, "questionText": "Scenario: K-Means on categorical data. Limitation?", "options": [ "K-Means requires numerical data; cannot handle categorical directly", "Clusters merge randomly", "Categorical data improves clustering", "Algorithm automatically encodes categories" ], "correctAnswerIndex": 0, "explanation": "K-Means relies on distance metrics, which are not directly defined for categorical data." }, { "id": 13, "questionText": "Scenario: Selecting K for K-Means. Which method helps?", "options": [ "Merge dendrograms", "Random selection", "Elbow method", "Silhouette ignored" ], "correctAnswerIndex": 2, "explanation": "The Elbow method plots sum of squared errors vs K and identifies an 'elbow' point as optimal K." }, { "id": 14, "questionText": "Scenario: K-Means fails to separate overlapping clusters. Likely reason?", "options": [ "K too small", "Centroids are optimal", "Algorithm converged correctly", "Clusters are not well-separated or non-convex" ], "correctAnswerIndex": 3, "explanation": "K-Means works best for well-separated convex clusters; overlapping clusters are challenging." }, { "id": 15, "questionText": "Scenario: K-Means with high-dimensional data. Challenge?", "options": [ "Algorithm fails automatically", "Noise ignored", "Distance metrics lose meaning; consider PCA or feature scaling", "Clusters are always compact" ], "correctAnswerIndex": 2, "explanation": "High-dimensional spaces dilute distances; dimensionality reduction improves clustering." }, { "id": 16, "questionText": "Scenario: K-Means with K too large. Effect?", "options": [ "Algorithm automatically reduces K", "Clusters always improve", "Clusters may become small and meaningless", "Noise ignored" ], "correctAnswerIndex": 2, "explanation": "Choosing K too large can lead to many tiny clusters with no meaningful pattern." }, { "id": 17, "questionText": "Scenario: Mini-Batch K-Means. Advantage?", "options": [ "Faster for large datasets with approximate centroids", "Removes noise automatically", "More accurate than standard K-Means", "Works only on small datasets" ], "correctAnswerIndex": 0, "explanation": "Mini-Batch K-Means updates centroids using small random batches for efficiency on large datasets." }, { "id": 18, "questionText": "Scenario: K-Means initialization affects results. 
Solution?", "options": [ "Use single random centroid only", "Run algorithm multiple times or use K-Means++", "Ignore initialization", "Reduce K randomly" ], "correctAnswerIndex": 1, "explanation": "K-Means++ and multiple runs improve stability and reduce sensitivity to initial centroids." }, { "id": 19, "questionText": "Scenario: K-Means distance metric. Standard choice?", "options": [ "Euclidean distance", "Jaccard index", "Hamming distance", "Cosine similarity" ], "correctAnswerIndex": 0, "explanation": "K-Means typically uses Euclidean distance to assign points to nearest centroids." }, { "id": 20, "questionText": "Scenario: K-Means clustering produces empty clusters. Cause?", "options": [ "Algorithm failed", "Clusters are compact", "No points assigned to some centroids", "Centroids are optimal" ], "correctAnswerIndex": 2, "explanation": "Some centroids may not attract any points, leading to empty clusters." }, { "id": 21, "questionText": "Scenario: K-Means on text embeddings. Preprocessing required?", "options": [ "Use categorical K-Means directly", "No preprocessing needed", "Randomly assign clusters", "Normalize or scale vectors before clustering" ], "correctAnswerIndex": 3, "explanation": "Text embeddings often need normalization to prevent certain dimensions from dominating distance computations." }, { "id": 22, "questionText": "Scenario: K-Means with very high K. Effect on SSE (sum of squared errors)?", "options": [ "SSE increases", "SSE is ignored", "SSE decreases as K increases", "SSE remains constant" ], "correctAnswerIndex": 2, "explanation": "As K increases, each cluster contains fewer points, reducing the sum of squared distances." }, { "id": 23, "questionText": "Scenario: K-Means on scaled vs unscaled features. Effect?", "options": [ "Clusters merge randomly", "Scaling is important; features with large range dominate clustering", "Algorithm fails if not scaled", "Scaling is unnecessary" ], "correctAnswerIndex": 1, "explanation": "Features with larger numerical ranges can dominate Euclidean distance; scaling ensures fair contribution." }, { "id": 24, "questionText": "Scenario: K-Means convergence criteria. Standard check?", "options": [ "Centroid positions or cluster assignments stop changing", "Distance metric ignored", "Random stopping", "Maximum iterations only" ], "correctAnswerIndex": 0, "explanation": "Algorithm stops when centroids or cluster assignments stabilize, or after a max number of iterations." }, { "id": 25, "questionText": "Scenario: K-Means for image compression. How?", "options": [ "Cluster pixel colors and replace each pixel by its centroid", "Use hierarchical clustering", "Remove noise automatically", "Reduce image resolution" ], "correctAnswerIndex": 0, "explanation": "K-Means clusters similar colors, allowing image compression by using cluster centroids as representative colors." }, { "id": 26, "questionText": "Scenario: K-Means for customer segmentation. Benefit?", "options": [ "Identify customer groups for targeted marketing", "Automatically predicts sales", "Detects trends over time", "Removes outliers" ], "correctAnswerIndex": 0, "explanation": "K-Means helps segment customers based on behavior, allowing targeted campaigns." }, { "id": 27, "questionText": "Scenario: K-Means clustering results differ on repeated runs. 
Cause?", "options": [ "Distance metric varies", "Random initialization of centroids", "Algorithm deterministic", "Clusters merge randomly" ], "correctAnswerIndex": 1, "explanation": "Random initial centroids can lead to different final clusters; K-Means++ mitigates this." }, { "id": 28, "questionText": "Scenario: K-Means for anomaly detection. Approach?", "options": [ "Clusters merge randomly", "Points far from nearest centroid may be anomalies", "All points treated equally", "Noise automatically ignored" ], "correctAnswerIndex": 1, "explanation": "Outliers are detected as points distant from cluster centroids." }, { "id": 29, "questionText": "Scenario: K-Means clustering on geospatial data. Best practice?", "options": [ "Use distance metric appropriate for coordinates (e.g., haversine)", "Randomly assign clusters", "Use Euclidean blindly", "Clusters merge arbitrarily" ], "correctAnswerIndex": 0, "explanation": "Euclidean distance may misrepresent geographic distances; use geodesic metrics like haversine." }, { "id": 30, "questionText": "Scenario: K-Means with highly correlated features. Solution?", "options": [ "Increase K randomly", "Apply PCA to reduce correlated dimensions", "Ignore correlations", "Clusters merge arbitrarily" ], "correctAnswerIndex": 1, "explanation": "PCA reduces correlated features and improves clustering performance." }, { "id": 31, "questionText": "Scenario: K-Means on non-spherical clusters. Limitation?", "options": [ "Noise ignored", "Clusters are always compact", "Algorithm automatically adapts", "K-Means assumes spherical clusters; non-spherical clusters may be poorly separated" ], "correctAnswerIndex": 3, "explanation": "K-Means relies on Euclidean distance and assumes roughly spherical clusters, so elongated or irregular clusters are not well captured." }, { "id": 32, "questionText": "Scenario: K-Means clustering produces clusters with very different densities. Challenge?", "options": [ "Distance metric ignored", "Clusters always equal", "Low-density clusters may be merged incorrectly", "Algorithm detects densities automatically" ], "correctAnswerIndex": 2, "explanation": "K-Means does not account for density; clusters with differing densities may not be separated properly." }, { "id": 33, "questionText": "Scenario: K-Means applied to text embeddings. Best practice?", "options": [ "Increase K arbitrarily", "Normalize embeddings to unit vectors before clustering", "Remove half the features randomly", "Use raw embeddings" ], "correctAnswerIndex": 1, "explanation": "Normalization ensures that distance computation reflects angle similarity rather than magnitude differences." }, { "id": 34, "questionText": "Scenario: K-Means clustering with missing values. Approach?", "options": [ "Ignore missing values", "Randomly assign missing values", "Algorithm automatically handles them", "Impute missing values before clustering" ], "correctAnswerIndex": 3, "explanation": "K-Means requires complete numerical data; missing values should be imputed or removed." }, { "id": 35, "questionText": "Scenario: K-Means clustering on multi-dimensional customer features. Preprocessing step?", "options": [ "Randomly drop features", "Scale features so all dimensions contribute equally", "Increase K arbitrarily", "Leave features unscaled" ], "correctAnswerIndex": 1, "explanation": "Feature scaling ensures that dimensions with larger ranges do not dominate Euclidean distance." }, { "id": 36, "questionText": "Scenario: K-Means on a dataset with outliers. 
Solution?", "options": [ "Use standard K-Means without changes", "Randomly assign clusters", "Increase K to compensate", "Remove or preprocess outliers before clustering" ], "correctAnswerIndex": 3, "explanation": "Outliers can distort centroids; preprocessing improves clustering accuracy." }, { "id": 37, "questionText": "Scenario: K-Means convergence too slow. Solution?", "options": [ "Ignore convergence", "Change distance metric arbitrarily", "Use Mini-Batch K-Means or reduce dataset size", "Increase K randomly" ], "correctAnswerIndex": 2, "explanation": "Mini-Batch K-Means or subsampling speeds up convergence for large datasets." }, { "id": 38, "questionText": "Scenario: K-Means clustering with highly correlated features. Best approach?", "options": [ "Increase K", "Ignore correlation", "Merge clusters arbitrarily", "Apply PCA or feature selection to reduce redundancy" ], "correctAnswerIndex": 3, "explanation": "Reducing correlated dimensions prevents redundant information from biasing distance calculations." }, { "id": 39, "questionText": "Scenario: K-Means clustering on skewed data. Issue?", "options": [ "Noise ignored", "Algorithm corrects automatically", "Clusters may be biased towards dense regions", "Clusters always balanced" ], "correctAnswerIndex": 2, "explanation": "Skewed distributions can lead to unequal cluster sizes or poorly defined boundaries." }, { "id": 40, "questionText": "Scenario: K-Means with K unknown. Methods to select K?", "options": [ "Random choice", "Algorithm decides automatically", "Use maximum data points", "Elbow method, silhouette score, gap statistic" ], "correctAnswerIndex": 3, "explanation": "These methods help determine optimal K by evaluating clustering performance." }, { "id": 41, "questionText": "Scenario: K-Means produces very similar clusters on repeated runs. Possible reason?", "options": [ "Algorithm converged incorrectly", "Data naturally forms stable clusters", "Distance metric is wrong", "Initialization randomization failed" ], "correctAnswerIndex": 1, "explanation": "If data has well-separated clusters, K-Means results are stable across runs." }, { "id": 42, "questionText": "Scenario: K-Means on a small dataset with large K. Risk?", "options": [ "Algorithm fails completely", "Centroids ignored", "Clusters may be too small or empty", "Clusters automatically merge" ], "correctAnswerIndex": 2, "explanation": "Too many clusters for small datasets can produce meaningless or empty clusters." }, { "id": 43, "questionText": "Scenario: K-Means++ initialization. Benefit?", "options": [ "Improves cluster quality by selecting distant initial centroids", "Random initialization", "Always produces identical clusters", "Removes noise automatically" ], "correctAnswerIndex": 0, "explanation": "K-Means++ reduces poor initialization by spreading centroids apart." }, { "id": 44, "questionText": "Scenario: K-Means with categorical features. Solution?", "options": [ "Use K-Prototypes or encode categories numerically", "Clusters merge randomly", "Ignore categorical data", "Use standard K-Means directly" ], "correctAnswerIndex": 0, "explanation": "Standard K-Means cannot handle categorical data; K-Prototypes or encoding is needed." }, { "id": 45, "questionText": "Scenario: K-Means on noisy sensor data. Best practice?", "options": [ "Use raw data", "Increase K arbitrarily", "Filter or preprocess noise before clustering", "Ignore convergence" ], "correctAnswerIndex": 2, "explanation": "Noise affects centroids and cluster assignment; preprocessing improves results." 
}, { "id": 46, "questionText": "Scenario: K-Means for image segmentation. Metric for colors?", "options": [ "Cosine similarity", "Euclidean distance in RGB or LAB space", "Hamming distance", "Jaccard index" ], "correctAnswerIndex": 1, "explanation": "Euclidean distance is standard for numerical pixel features in color space." }, { "id": 47, "questionText": "Scenario: K-Means convergence to local minimum. Reason?", "options": [ "Poor initialization of centroids", "Algorithm always finds global minimum", "Clusters are too compact", "Distance metric is incorrect" ], "correctAnswerIndex": 0, "explanation": "Random initial centroids can lead K-Means to converge to suboptimal local minima." }, { "id": 48, "questionText": "Scenario: K-Means clustering with overlapping clusters. Limitation?", "options": [ "Clusters merge automatically", "Algorithm adapts perfectly", "Cannot clearly separate overlapping clusters", "Noise ignored" ], "correctAnswerIndex": 2, "explanation": "K-Means relies on distance; overlapping clusters may not be correctly assigned." }, { "id": 49, "questionText": "Scenario: K-Means for market segmentation. Use case?", "options": [ "Remove outliers automatically", "Identify customer groups for targeted campaigns", "Predict stock prices", "Visualize time series" ], "correctAnswerIndex": 1, "explanation": "K-Means clusters similar customers to enable targeted marketing strategies." }, { "id": 50, "questionText": "Scenario: K-Means for anomaly detection in credit card transactions. Approach?", "options": [ "Transactions far from cluster centroids may be fraudulent", "All transactions treated equally", "Clusters merge automatically", "Noise ignored" ], "correctAnswerIndex": 0, "explanation": "Outliers distant from normal clusters can indicate anomalous or fraudulent activity." }, { "id": 51, "questionText": "Scenario: K-Means on high-dimensional gene expression data. Best practice?", "options": [ "Clusters merge randomly", "Use raw high-dimensional data directly", "Increase K arbitrarily", "Use PCA or dimensionality reduction before clustering" ], "correctAnswerIndex": 3, "explanation": "Dimensionality reduction helps meaningful clustering and avoids distance dilution." }, { "id": 52, "questionText": "Scenario: K-Means on very large dataset. Speed-up technique?", "options": [ "Mini-Batch K-Means", "Increase K", "Ignore convergence", "Use raw data" ], "correctAnswerIndex": 0, "explanation": "Mini-Batch K-Means updates centroids using batches, reducing computation time." }, { "id": 53, "questionText": "Scenario: K-Means applied to IoT sensor data with missing values. Solution?", "options": [ "Impute missing values before clustering", "Remove entire dataset", "Assign clusters randomly", "Ignore missing values" ], "correctAnswerIndex": 0, "explanation": "K-Means requires complete numerical data; missing values must be handled prior to clustering." }, { "id": 54, "questionText": "Scenario: K-Means applied to customer purchase history. Challenge?", "options": [ "Clusters automatically balanced", "Sparse purchase data may lead to poor cluster separation", "Algorithm converges perfectly", "Noise ignored" ], "correctAnswerIndex": 1, "explanation": "Sparse or high-dimensional data can reduce clustering accuracy; preprocessing helps." }, { "id": 55, "questionText": "Scenario: K-Means with categorical features encoded as numbers. 
Risk?", "options": [ "Algorithm works perfectly", "Clusters merge automatically", "Noise ignored", "Numerical encoding may introduce artificial distance relationships" ], "correctAnswerIndex": 3, "explanation": "Direct numeric encoding of categorical data can misrepresent similarity between categories." }, { "id": 56, "questionText": "Scenario: K-Means for spatial clustering of stores. Best practice?", "options": [ "Increase K arbitrarily", "Clusters merge randomly", "Use raw coordinates directly", "Normalize coordinates or use appropriate distance metric" ], "correctAnswerIndex": 3, "explanation": "Scaling ensures coordinates are comparable and distance computations are accurate." }, { "id": 57, "questionText": "Scenario: K-Means produces poor clustering. Possible reason?", "options": [ "Data not suitable for K-Means (non-spherical or overlapping)", "Centroids incorrect", "Algorithm always finds perfect clusters", "Distance metric irrelevant" ], "correctAnswerIndex": 0, "explanation": "K-Means struggles with non-spherical or overlapping clusters." }, { "id": 58, "questionText": "Scenario: K-Means clustering on scaled features. Advantage?", "options": [ "Distance metric changes", "Clusters merge automatically", "Prevents dominance by features with large range", "Algorithm ignores scaling" ], "correctAnswerIndex": 2, "explanation": "Scaling ensures each feature contributes equally to Euclidean distance calculations." }, { "id": 59, "questionText": "Scenario: K-Means with clusters of unequal variance. Issue?", "options": [ "Noise ignored", "Algorithm automatically adjusts", "Clusters always compact", "Clusters may not accurately represent data structure" ], "correctAnswerIndex": 3, "explanation": "K-Means assumes similar variance; large differences affect cluster quality." }, { "id": 60, "questionText": "Scenario: K-Means applied to time-series data. Approach?", "options": [ "Use raw sequences directly", "Increase K arbitrarily", "Clusters merge randomly", "Extract meaningful features before clustering" ], "correctAnswerIndex": 3, "explanation": "Feature extraction ensures distance metrics are meaningful for time-series clustering." }, { "id": 61, "questionText": "Scenario: K-Means clusters overlap. Evaluation metric?", "options": [ "Use SSE only", "Clusters merge randomly", "Ignore overlap", "Silhouette score measures separation and cohesion" ], "correctAnswerIndex": 3, "explanation": "Silhouette score evaluates how well points fit within their clusters vs others." }, { "id": 62, "questionText": "Scenario: K-Means with too few clusters. Result?", "options": [ "Clusters may merge dissimilar points, reducing interpretability", "Algorithm adapts automatically", "Clusters always compact", "Noise ignored" ], "correctAnswerIndex": 0, "explanation": "Too small K forces dissimilar points into same cluster, reducing accuracy." }, { "id": 63, "questionText": "Scenario: K-Means for market basket analysis. Limitation?", "options": [ "Algorithm works perfectly", "Noise ignored", "Clusters merge automatically", "Sparse and categorical data requires encoding or alternate methods" ], "correctAnswerIndex": 3, "explanation": "Sparse categorical data needs careful preprocessing or K-Prototypes instead of K-Means." }, { "id": 64, "questionText": "Scenario: K-Means produces empty clusters. 
Solution?", "options": [ "Ignore empty clusters", "Algorithm fails automatically", "Increase K randomly", "Reinitialize centroids or reduce K" ], "correctAnswerIndex": 3, "explanation": "Reassigning centroids or reducing K resolves empty clusters." }, { "id": 65, "questionText": "Scenario: K-Means with high-dimensional data. Challenge?", "options": [ "Distance metrics lose meaning; reduce dimensions", "Algorithm adapts automatically", "Clusters always accurate", "Noise ignored" ], "correctAnswerIndex": 0, "explanation": "High dimensions dilute distances, making clustering unreliable without dimensionality reduction." }, { "id": 66, "questionText": "Scenario: K-Means on normalized vs unnormalized features. Effect?", "options": [ "Algorithm automatically scales", "Normalization ensures fair distance contribution across features", "Unnormalized always better", "Clusters merge randomly" ], "correctAnswerIndex": 1, "explanation": "Normalized features prevent features with large ranges from dominating clustering." }, { "id": 67, "questionText": "Scenario: K-Means++ vs random initialization. Advantage?", "options": [ "Improves clustering stability and convergence", "Random initialization always better", "No difference in results", "Removes noise automatically" ], "correctAnswerIndex": 0, "explanation": "K-Means++ selects initial centroids to reduce poor local minima." }, { "id": 68, "questionText": "Scenario: K-Means on customer purchase amounts. Data skewed. Solution?", "options": [ "Use raw data", "Log-transform or scale data before clustering", "Increase K", "Ignore skew" ], "correctAnswerIndex": 1, "explanation": "Transforming skewed data prevents high-value points from dominating clustering." }, { "id": 69, "questionText": "Scenario: K-Means on text data after TF-IDF. Challenge?", "options": [ "Algorithm works perfectly", "High-dimensional sparse vectors; dimensionality reduction recommended", "Noise ignored", "Clusters merge randomly" ], "correctAnswerIndex": 1, "explanation": "Sparse high-dimensional TF-IDF vectors may reduce clustering effectiveness without reduction." }, { "id": 70, "questionText": "Scenario: K-Means applied to IoT device readings. Best practice?", "options": [ "Increase K randomly", "Use raw readings", "Ignore convergence", "Normalize or scale features to ensure meaningful clustering" ], "correctAnswerIndex": 3, "explanation": "Scaling ensures that features contribute equally to distance calculations for clustering." }, { "id": 71, "questionText": "Scenario: K-Means clustering applied to gene expression data with thousands of features. Best approach?", "options": [ "Apply PCA or feature selection to reduce dimensionality before clustering", "Randomly remove features", "Increase K arbitrarily", "Use all features directly" ], "correctAnswerIndex": 0, "explanation": "High-dimensional gene data can dilute distances; dimensionality reduction ensures meaningful clusters." }, { "id": 72, "questionText": "Scenario: K-Means with very large K relative to dataset size. Risk?", "options": [ "Clusters may be meaningless or empty", "Algorithm automatically adjusts", "Distance metric ignored", "Clusters merge automatically" ], "correctAnswerIndex": 0, "explanation": "Too many clusters can lead to tiny or empty clusters with no interpretability." }, { "id": 73, "questionText": "Scenario: K-Means on data with non-uniform density clusters. 
Limitation?", "options": [ "Noise ignored", "Low-density clusters may merge with high-density ones", "Algorithm adjusts automatically", "Clusters always compact" ], "correctAnswerIndex": 1, "explanation": "K-Means does not handle varying densities well; denser clusters dominate centroid assignment." }, { "id": 74, "questionText": "Scenario: K-Means on highly skewed financial transaction data. Best preprocessing?", "options": [ "Apply log transformation to reduce skew before clustering", "Use raw data", "Clusters merge randomly", "Increase K arbitrarily" ], "correctAnswerIndex": 0, "explanation": "Log or other transformations reduce the effect of extreme values, improving clustering quality." }, { "id": 75, "questionText": "Scenario: K-Means on time-series data. Effective method?", "options": [ "Extract meaningful features such as trends or seasonal components before clustering", "Use raw sequences directly", "Clusters merge automatically", "Increase K randomly" ], "correctAnswerIndex": 0, "explanation": "Feature extraction ensures distances reflect meaningful similarities in time-series." }, { "id": 76, "questionText": "Scenario: K-Means clustering for anomaly detection in network traffic. Strategy?", "options": [ "All points treated equally", "Points far from cluster centroids are likely anomalies", "Noise ignored", "Clusters merge automatically" ], "correctAnswerIndex": 1, "explanation": "Outliers distant from normal traffic clusters are potential anomalies." }, { "id": 77, "questionText": "Scenario: K-Means applied to image color compression. Challenge?", "options": [ "Algorithm automatically selects K", "All clusters identical", "Noise ignored", "Choosing optimal K to balance compression and image quality" ], "correctAnswerIndex": 3, "explanation": "Selecting K is critical; too few clusters lose color details, too many reduce compression." }, { "id": 78, "questionText": "Scenario: K-Means++ vs multiple random initializations. Advantage of K-Means++?", "options": [ "Reduces likelihood of poor local minima and improves convergence", "Removes noise automatically", "Random initializations are better", "No difference in results" ], "correctAnswerIndex": 0, "explanation": "K-Means++ selects initial centroids that are distant, improving stability and cluster quality." }, { "id": 79, "questionText": "Scenario: K-Means applied to sparse TF-IDF text vectors. Best approach?", "options": [ "Use raw sparse vectors directly", "Increase K arbitrarily", "Reduce dimensionality using techniques like Truncated SVD before clustering", "Clusters merge randomly" ], "correctAnswerIndex": 2, "explanation": "High-dimensional sparse data may produce poor clusters; dimensionality reduction improves performance." }, { "id": 80, "questionText": "Scenario: K-Means clustering with overlapping spherical clusters. How to improve?", "options": [ "K-Means always works", "Clusters merge automatically", "Reduce K randomly", "Use Gaussian Mixture Models (GMM) for soft clustering" ], "correctAnswerIndex": 3, "explanation": "GMM can model cluster overlap using probability distributions, unlike hard K-Means assignments." }, { "id": 81, "questionText": "Scenario: K-Means for customer segmentation with categorical attributes. Best practice?", "options": [ "Use K-Prototypes or encode categories numerically", "Ignore categorical data", "Use standard K-Means directly", "Clusters merge randomly" ], "correctAnswerIndex": 0, "explanation": "K-Prototypes handles mixed numerical and categorical data effectively." 
}, { "id": 82, "questionText": "Scenario: K-Means convergence to local minimum. Cause?", "options": [ "Distance metric incorrect", "Clusters too compact", "Poor or random initialization of centroids", "Algorithm always finds global minimum" ], "correctAnswerIndex": 2, "explanation": "K-Means may converge to suboptimal solutions depending on initial centroids." }, { "id": 83, "questionText": "Scenario: K-Means applied to geospatial clustering. Recommendation?", "options": [ "Increase K randomly", "Use appropriate distance metrics like haversine for coordinates", "Clusters merge arbitrarily", "Use Euclidean distance blindly" ], "correctAnswerIndex": 1, "explanation": "Geographic distances require correct metric to ensure accurate clustering." }, { "id": 84, "questionText": "Scenario: K-Means with very large datasets. Efficient solution?", "options": [ "Use Mini-Batch K-Means", "Increase K arbitrarily", "Ignore convergence", "Use full dataset only" ], "correctAnswerIndex": 0, "explanation": "Mini-Batch K-Means speeds up computation by using small random batches for centroid updates." }, { "id": 85, "questionText": "Scenario: K-Means on noisy IoT sensor data. Best preprocessing?", "options": [ "Filter or smooth noise before clustering", "Clusters merge automatically", "Increase K arbitrarily", "Use raw data" ], "correctAnswerIndex": 0, "explanation": "Noise can distort centroids; preprocessing improves clustering reliability." }, { "id": 86, "questionText": "Scenario: K-Means on very high-dimensional data. Limitation?", "options": [ "Distance metrics lose meaning; dimensionality reduction recommended", "Noise ignored", "Clusters always accurate", "Algorithm adapts automatically" ], "correctAnswerIndex": 0, "explanation": "High-dimensional spaces dilute distances, leading to poor cluster assignments." }, { "id": 87, "questionText": "Scenario: K-Means for anomaly detection in healthcare data. Approach?", "options": [ "Noise ignored", "Points far from cluster centroids may indicate anomalies", "Clusters merge automatically", "All points treated equally" ], "correctAnswerIndex": 1, "explanation": "Outliers distant from normal clusters can indicate anomalies or rare events." }, { "id": 88, "questionText": "Scenario: K-Means on image segmentation with varying illumination. Challenge?", "options": [ "Preprocessing like normalization is needed to reduce lighting effect", "Algorithm works perfectly", "Increase K randomly", "Clusters merge automatically" ], "correctAnswerIndex": 0, "explanation": "Differences in lighting affect pixel values; normalization improves clustering consistency." }, { "id": 89, "questionText": "Scenario: K-Means for market segmentation with mixed purchase behavior. Solution?", "options": [ "Ignore categorical data", "Clusters merge randomly", "Use numerical encoding or K-Prototypes for categorical and numerical features", "Use standard K-Means directly" ], "correctAnswerIndex": 2, "explanation": "Mixed data requires specialized clustering methods for meaningful segmentation." }, { "id": 90, "questionText": "Scenario: K-Means clustering produces empty clusters repeatedly. Best solution?", "options": [ "Algorithm fails automatically", "Increase K arbitrarily", "Ignore empty clusters", "Reinitialize centroids or reduce K" ], "correctAnswerIndex": 3, "explanation": "Empty clusters occur when centroids have no assigned points; reinitialization or lowering K resolves this." }, { "id": 91, "questionText": "Scenario: K-Means applied to highly imbalanced datasets. 
Issue?", "options": [ "Large clusters may dominate, small clusters underrepresented", "Clusters always balanced", "Noise ignored", "Algorithm adapts automatically" ], "correctAnswerIndex": 0, "explanation": "K-Means does not account for cluster size; imbalance may distort results." }, { "id": 92, "questionText": "Scenario: K-Means applied to network traffic logs for intrusion detection. Best approach?", "options": [ "Use raw logs directly", "Increase K arbitrarily", "Preprocess logs into numerical features and detect points far from centroids", "Clusters merge automatically" ], "correctAnswerIndex": 2, "explanation": "Transforming logs to numerical vectors enables clustering and anomaly detection." }, { "id": 93, "questionText": "Scenario: K-Means clustering with multiple valid K values. Evaluation metric?", "options": [ "Silhouette score to evaluate cluster quality", "Ignore K selection", "Use SSE only", "Clusters merge randomly" ], "correctAnswerIndex": 0, "explanation": "Silhouette score measures cohesion and separation, helping choose optimal K." }, { "id": 94, "questionText": "Scenario: K-Means applied to text clustering using word embeddings. Limitation?", "options": [ "High-dimensional vectors may require dimensionality reduction or normalization", "Clusters merge randomly", "Algorithm works perfectly", "Noise ignored" ], "correctAnswerIndex": 0, "explanation": "Dimensionality reduction and normalization improve clustering accuracy for embeddings." }, { "id": 95, "questionText": "Scenario: K-Means clustering results vary on repeated runs. Best solution?", "options": [ "Use K-Means++ initialization or multiple runs", "Clusters merge randomly", "Ignore variations", "Increase K arbitrarily" ], "correctAnswerIndex": 0, "explanation": "Better initialization reduces sensitivity to random centroid placement." }, { "id": 96, "questionText": "Scenario: K-Means on scaled features vs unscaled features. Observation?", "options": [ "Scaling ensures fair contribution of all features to distance calculation", "Clusters merge randomly", "Algorithm adapts automatically", "Scaling is unnecessary" ], "correctAnswerIndex": 0, "explanation": "Without scaling, features with larger ranges dominate cluster assignments." }, { "id": 97, "questionText": "Scenario: K-Means clustering on overlapping clusters. Alternative?", "options": [ "Reduce K randomly", "Clusters merge automatically", "Use soft clustering like Gaussian Mixture Models", "K-Means handles overlap perfectly" ], "correctAnswerIndex": 2, "explanation": "Soft clustering models allow points to belong probabilistically to multiple clusters." }, { "id": 98, "questionText": "Scenario: K-Means applied to sensor network data with missing values. Solution?", "options": [ "Impute missing values before clustering", "Assign clusters randomly", "Remove entire dataset", "Ignore missing values" ], "correctAnswerIndex": 0, "explanation": "K-Means requires complete data; missing values must be handled prior to clustering." }, { "id": 99, "questionText": "Scenario: K-Means on customer behavior data with high variance features. Best approach?", "options": [ "Increase K arbitrarily", "Clusters merge randomly", "Use raw data", "Scale or normalize features to prevent dominance by high-variance features" ], "correctAnswerIndex": 3, "explanation": "Scaling ensures fair contribution of each feature to distance computation." }, { "id": 100, "questionText": "Scenario: K-Means applied to a large dataset with many outliers. 
Recommendation?", "options": [ "Increase K arbitrarily", "Clusters merge automatically", "Use raw data directly", "Preprocess to remove or handle outliers before clustering" ], "correctAnswerIndex": 3, "explanation": "Outliers distort centroids; preprocessing ensures meaningful cluster assignments." } ] }