| { | |
| "title": "LightGBM Mastery: Part 1 (Q1–25)", | |
| "description": "Questions 1–25 covering LightGBM fundamentals — boosting basics, leaf-wise growth, histogram optimization, and key parameters.", | |
| "questions": [ | |
| { | |
| "id": 1, | |
| "questionText": "What type of algorithm is LightGBM based on?", | |
| "options": [ | |
| "Bagging", | |
| "Boosting", | |
| "Stacking", | |
| "Voting" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "LightGBM is a boosting algorithm, specifically a gradient boosting framework that builds models sequentially." | |
| }, | |
| { | |
| "id": 2, | |
| "questionText": "Which of the following is a unique characteristic of LightGBM?", | |
| "options": [ | |
| "It grows trees level-wise", | |
| "It grows trees leaf-wise", | |
| "It uses deep neural networks", | |
| "It averages models" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "LightGBM grows trees leaf-wise (best-first) to reduce loss more efficiently compared to level-wise methods." | |
| }, | |
| { | |
| "id": 3, | |
| "questionText": "What type of trees does LightGBM primarily use?", | |
| "options": [ | |
| "Shallow random trees", | |
| "Deep neural trees", | |
| "Decision trees", | |
| "Regression trees" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "LightGBM primarily uses regression trees for both classification and regression tasks." | |
| }, | |
| { | |
| "id": 4, | |
| "questionText": "Which technique allows LightGBM to handle large datasets efficiently?", | |
| "options": [ | |
| "Feature hashing", | |
| "Histogram-based algorithm", | |
| "Random sampling", | |
| "PCA compression" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "LightGBM uses a histogram-based algorithm to reduce computation by discretizing continuous features into bins." | |
| }, | |
| { | |
| "id": 5, | |
| "questionText": "Scenario: You have very large dataset with millions of rows. Why is LightGBM preferred over XGBoost?", | |
| "options": [ | |
| "Because it uses neural networks internally", | |
| "Because it uses histogram-based splits and leaf-wise growth for efficiency", | |
| "Because it reduces model interpretability", | |
| "Because it doesn’t require gradient computation" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "LightGBM is optimized for large datasets using histogram-based splits and leaf-wise tree growth." | |
| }, | |
| { | |
| "id": 6, | |
| "questionText": "Which LightGBM parameter controls the number of leaves in a single tree?", | |
| "options": [ | |
| "num_leaves", | |
| "max_depth", | |
| "min_child_samples", | |
| "n_estimators" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The num_leaves parameter sets the maximum number of leaves in one tree, controlling model complexity." | |
| }, | |
| { | |
| "id": 7, | |
| "questionText": "What happens if num_leaves is set too high?", | |
| "options": [ | |
| "Model becomes underfit", | |
| "Model becomes overfit", | |
| "Model trains faster", | |
| "Model ignores small features" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Too many leaves can make the model overly complex and prone to overfitting." | |
| }, | |
| { | |
| "id": 8, | |
| "questionText": "Which LightGBM parameter controls the learning rate?", | |
| "options": [ | |
| "shrinkage_rate", | |
| "alpha", | |
| "learning_rate", | |
| "lambda" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "The learning_rate parameter determines how much each new tree contributes to the model." | |
| }, | |
| { | |
| "id": 9, | |
| "questionText": "Scenario: Model accuracy stagnates early during training. Which parameter can you increase?", | |
| "options": [ | |
| "num_leaves", | |
| "learning_rate", | |
| "n_estimators", | |
| "feature_fraction" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Increasing n_estimators (number of boosting iterations) allows the model to learn longer." | |
| }, | |
| { | |
| "id": 10, | |
| "questionText": "What is feature_fraction used for in LightGBM?", | |
| "options": [ | |
| "Regularization to reduce overfitting by randomly selecting a fraction of features per tree", | |
| "Adjusting leaf size", | |
| "Reducing data size by sampling rows", | |
| "Controlling the number of leaves" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "feature_fraction randomly selects a fraction of features to train each tree, helping regularize the model." | |
| }, | |
| { | |
| "id": 11, | |
| "questionText": "Which LightGBM parameter limits tree depth?", | |
| "options": [ | |
| "max_depth", | |
| "num_leaves", | |
| "min_split_gain", | |
| "subsample" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "max_depth directly limits how deep trees can grow, preventing overfitting." | |
| }, | |
| { | |
| "id": 12, | |
| "questionText": "What does min_child_samples control?", | |
| "options": [ | |
| "Minimum number of samples required in a leaf", | |
| "Minimum number of features used in a tree", | |
| "Minimum iterations before early stopping", | |
| "Minimum value for learning rate" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "min_child_samples ensures that leaves have enough data points, acting as a regularization technique." | |
| }, | |
| { | |
| "id": 13, | |
| "questionText": "Scenario: LightGBM model is overfitting. Which change helps reduce it?", | |
| "options": [ | |
| "Decrease num_leaves", | |
| "Increase num_leaves", | |
| "Increase learning rate", | |
| "Remove regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing num_leaves decreases model complexity and helps combat overfitting." | |
| }, | |
| { | |
| "id": 14, | |
| "questionText": "What does boosting_type='dart' mean in LightGBM?", | |
| "options": [ | |
| "It uses Dropouts meet Multiple Additive Regression Trees", | |
| "It disables boosting", | |
| "It performs bagging only", | |
| "It builds random forests" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The DART variant of LightGBM randomly drops trees during boosting to improve generalization." | |
| }, | |
| { | |
| "id": 15, | |
| "questionText": "Which LightGBM boosting type uses dropouts for regularization?", | |
| "options": [ | |
| "gbdt", | |
| "dart", | |
| "goss", | |
| "rf" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "The DART boosting type introduces dropout in boosting to prevent overfitting." | |
| }, | |
| { | |
| "id": 16, | |
| "questionText": "What does goss stand for in LightGBM?", | |
| "options": [ | |
| "Gradient-based One-Side Sampling", | |
| "Gradient Optimization Sampling System", | |
| "Global Outlier Sampling Strategy", | |
| "Generalized Optimization Split Search" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "GOSS is Gradient-based One-Side Sampling — a LightGBM optimization that speeds up training by sampling instances with large gradients." | |
| }, | |
| { | |
| "id": 17, | |
| "questionText": "Which LightGBM parameter helps in row subsampling?", | |
| "options": [ | |
| "bagging_fraction", | |
| "feature_fraction", | |
| "lambda_l1", | |
| "min_split_gain" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "bagging_fraction controls the fraction of data used per iteration, providing row-wise subsampling." | |
| }, | |
| { | |
| "id": 18, | |
| "questionText": "Scenario: You want faster training but can tolerate a small loss in accuracy. Which parameter can you reduce?", | |
| "options": [ | |
| "bagging_fraction", | |
| "num_leaves", | |
| "max_depth", | |
| "feature_fraction" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing bagging_fraction increases speed by using fewer data rows per iteration." | |
| }, | |
| { | |
| "id": 19, | |
| "questionText": "What does lambda_l1 control in LightGBM?", | |
| "options": [ | |
| "L1 regularization term on weights", | |
| "L2 regularization term on weights", | |
| "Dropout rate", | |
| "Learning rate decay" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "lambda_l1 adds L1 regularization on leaf weights to encourage sparsity and reduce overfitting." | |
| }, | |
| { | |
| "id": 20, | |
| "questionText": "Which LightGBM regularization term penalizes large leaf weights using L2 norm?", | |
| "options": [ | |
| "lambda_l1", | |
| "lambda_l2", | |
| "min_child_samples", | |
| "feature_fraction" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "lambda_l2 applies L2 regularization to prevent large leaf weights and stabilize training." | |
| }, | |
| { | |
| "id": 21, | |
| "questionText": "Scenario: Model accuracy is fluctuating during boosting. Which parameter helps smooth this effect?", | |
| "options": [ | |
| "learning_rate", | |
| "num_leaves", | |
| "min_gain_to_split", | |
| "bagging_freq" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A smaller learning_rate helps stabilize model updates, reducing fluctuations." | |
| }, | |
| { | |
| "id": 22, | |
| "questionText": "What is the role of min_gain_to_split?", | |
| "options": [ | |
| "Minimum loss reduction required for a split", | |
| "Minimum number of leaves required per tree", | |
| "Maximum number of features allowed", | |
| "Learning rate decay factor" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "min_gain_to_split prevents small, insignificant splits by requiring a minimum loss reduction." | |
| }, | |
| { | |
| "id": 23, | |
| "questionText": "Scenario: Dataset contains categorical variables. How does LightGBM handle them efficiently?", | |
| "options": [ | |
| "Using one-hot encoding automatically", | |
| "By internally converting them using optimal split algorithms", | |
| "By ignoring categorical variables", | |
| "By treating them as numeric values directly" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "LightGBM natively supports categorical features by finding optimal split points without full one-hot encoding." | |
| }, | |
| { | |
| "id": 24, | |
| "questionText": "Which parameter in LightGBM is used to handle categorical features?", | |
| "options": [ | |
| "categorical_feature", | |
| "cat_var", | |
| "cat_split", | |
| "categorical_index" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The categorical_feature parameter specifies which columns are treated as categorical during training." | |
| }, | |
| { | |
| "id": 25, | |
| "questionText": "Scenario: LightGBM is using GPU for training. Which advantage does this offer?", | |
| "options": [ | |
| "Faster histogram construction and split finding", | |
| "Automatic feature engineering", | |
| "Better interpretability", | |
| "Improved regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "GPU acceleration speeds up histogram creation and split computations, significantly reducing training time." | |
| } | |
| ] | |
| } | |
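Part 1 revolves around LightGBM's core training parameters (num_leaves, max_depth, learning_rate, feature_fraction, bagging, min_child_samples, boosting_type). The sketch below shows where those parameters go in the native Python API; the synthetic dataset and every value are illustrative placeholders, not tuned recommendations.

```python
import lightgbm as lgb
import numpy as np

# Hypothetical binary-classification data, just to make the sketch runnable.
rng = np.random.default_rng(42)
X = rng.normal(size=(5000, 20))
y = (X[:, 0] + rng.normal(scale=0.5, size=5000) > 0).astype(int)

train_set = lgb.Dataset(X[:4000], label=y[:4000])
valid_set = lgb.Dataset(X[4000:], label=y[4000:], reference=train_set)

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",   # alternatives: "dart" (dropout), "goss" (one-side sampling)
    "num_leaves": 31,          # max leaves per tree; too high -> overfitting
    "max_depth": -1,           # -1 = no explicit depth cap; num_leaves bounds complexity
    "learning_rate": 0.05,     # contribution of each new tree
    "feature_fraction": 0.8,   # fraction of features sampled per tree
    "bagging_fraction": 0.8,   # fraction of rows sampled per iteration
    "bagging_freq": 1,         # re-sample rows every iteration
    "min_child_samples": 20,   # minimum samples per leaf (alias of min_data_in_leaf)
    "lambda_l1": 0.0,          # L1 penalty on leaf weights
    "lambda_l2": 0.0,          # L2 penalty on leaf weights
}
booster = lgb.train(params, train_set, num_boost_round=300, valid_sets=[valid_set])
```

Most of the same parameter names are also accepted as keyword arguments by the scikit-learn wrappers (lgb.LGBMClassifier, lgb.LGBMRegressor), so the answers above apply to either interface.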
| { | |
| "title": "LightGBM Mastery: Part 2 (Q26–50)", | |
| "description": "Questions 26–50 exploring LightGBM tuning — sampling, regularization, parameter interactions, parallelization, and practical training strategies.", | |
| "questions": [ | |
| { | |
| "id": 26, | |
| "questionText": "What does the bagging_freq parameter control in LightGBM?", | |
| "options": [ | |
| "How frequently bagging is performed during training", | |
| "Number of features to drop per tree", | |
| "Learning rate schedule frequency", | |
| "Regularization update interval" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "bagging_freq determines after how many boosting iterations LightGBM performs row subsampling." | |
| }, | |
| { | |
| "id": 27, | |
| "questionText": "Which LightGBM optimization helps in handling large-scale datasets efficiently?", | |
| "options": [ | |
| "Histogram-based binning", | |
| "Deep tree expansion", | |
| "Dynamic pruning", | |
| "Recurrent boosting" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "LightGBM uses histogram-based binning to reduce computation and memory usage for large datasets." | |
| }, | |
| { | |
| "id": 28, | |
| "questionText": "Scenario: Dataset contains extreme class imbalance. Which parameter helps mitigate it?", | |
| "options": [ | |
| "scale_pos_weight", | |
| "bagging_fraction", | |
| "num_leaves", | |
| "min_child_samples" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "scale_pos_weight adjusts the relative weight of positive samples to handle imbalanced datasets effectively." | |
| }, | |
| { | |
| "id": 29, | |
| "questionText": "Which LightGBM setting should be increased to make the model less sensitive to noise?", | |
| "options": [ | |
| "min_child_samples", | |
| "num_leaves", | |
| "learning_rate", | |
| "max_depth" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Increasing min_child_samples ensures leaves contain more data points, making the model more robust to noise." | |
| }, | |
| { | |
| "id": 30, | |
| "questionText": "What is the role of early_stopping_round in LightGBM training?", | |
| "options": [ | |
| "Stops training when validation loss does not improve after a certain number of rounds", | |
| "Reduces learning rate automatically", | |
| "Saves best iteration for retraining", | |
| "Increases number of leaves gradually" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "early_stopping_round halts training if performance on validation data stops improving." | |
| }, | |
| { | |
| "id": 31, | |
| "questionText": "Scenario: You observe that LightGBM trains very fast but underfits. What adjustment helps?", | |
| "options": [ | |
| "Increase num_leaves or n_estimators", | |
| "Decrease learning rate", | |
| "Reduce max_depth", | |
| "Reduce feature_fraction" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Increasing num_leaves or n_estimators allows the model to capture more complexity and reduce underfitting." | |
| }, | |
| { | |
| "id": 32, | |
| "questionText": "What does LightGBM’s leaf-wise tree growth mean?", | |
| "options": [ | |
| "It splits the leaf with the highest loss reduction first", | |
| "It splits all leaves at the same level simultaneously", | |
| "It grows the tree symmetrically", | |
| "It uses fixed depth trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Leaf-wise growth selects and splits the leaf that gives the greatest loss reduction, leading to faster convergence." | |
| }, | |
| { | |
| "id": 33, | |
| "questionText": "Which parameter combination most affects model complexity?", | |
| "options": [ | |
| "num_leaves and max_depth", | |
| "feature_fraction and bagging_fraction", | |
| "learning_rate and n_estimators", | |
| "lambda_l1 and lambda_l2" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "num_leaves and max_depth jointly control tree structure and hence the complexity of the model." | |
| }, | |
| { | |
| "id": 34, | |
| "questionText": "Scenario: LightGBM runs out of memory on a massive dataset. Which setting helps reduce memory usage?", | |
| "options": [ | |
| "Reduce max_bin", | |
| "Increase learning_rate", | |
| "Set boosting_type to dart", | |
| "Increase num_leaves" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing max_bin decreases the number of histogram bins, lowering memory requirements." | |
| }, | |
| { | |
| "id": 35, | |
| "questionText": "What does the parameter max_bin represent in LightGBM?", | |
| "options": [ | |
| "Maximum number of bins to bucket continuous features", | |
| "Maximum number of leaves per tree", | |
| "Maximum depth of trees", | |
| "Maximum iterations for convergence" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "max_bin determines how many discrete bins each feature will be divided into during histogram building." | |
| }, | |
| { | |
| "id": 36, | |
| "questionText": "Scenario: Model training takes too long. Which adjustment improves speed most effectively?", | |
| "options": [ | |
| "Reduce max_bin or use bagging_fraction < 1", | |
| "Increase num_leaves", | |
| "Reduce learning_rate only", | |
| "Increase regularization terms" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing max_bin or using smaller bagging_fraction reduces the dataset processed each iteration, speeding up training." | |
| }, | |
| { | |
| "id": 37, | |
| "questionText": "What is the primary drawback of leaf-wise tree growth?", | |
| "options": [ | |
| "Higher risk of overfitting on small data", | |
| "Slower convergence", | |
| "Worse performance on large datasets", | |
| "Poor categorical handling" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Leaf-wise growth can overfit on small datasets because it may produce very deep trees." | |
| }, | |
| { | |
| "id": 38, | |
| "questionText": "Which LightGBM parameter defines how many bins are created for each feature?", | |
| "options": [ | |
| "max_bin", | |
| "num_leaves", | |
| "feature_fraction", | |
| "max_depth" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "max_bin sets how finely continuous features are bucketed into discrete bins for histogram-based learning." | |
| }, | |
| { | |
| "id": 39, | |
| "questionText": "What type of regularization do lambda_l1 and lambda_l2 correspond to?", | |
| "options": [ | |
| "Lasso and Ridge regularization", | |
| "Elastic Net regularization", | |
| "Dropout regularization", | |
| "Tree pruning regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "lambda_l1 and lambda_l2 implement Lasso (L1) and Ridge (L2) regularization respectively." | |
| }, | |
| { | |
| "id": 40, | |
| "questionText": "Scenario: You observe overfitting with high validation error. Which parameters help reduce it?", | |
| "options": [ | |
| "Increase min_child_samples, decrease num_leaves", | |
| "Increase learning_rate, increase num_leaves", | |
| "Reduce lambda_l2", | |
| "Increase feature_fraction" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Increasing min_child_samples and reducing num_leaves simplify the model, reducing overfitting." | |
| }, | |
| { | |
| "id": 41, | |
| "questionText": "What is the role of monotone_constraints in LightGBM?", | |
| "options": [ | |
| "Ensure certain features have monotonic relationships with the target", | |
| "Enforce equal feature importance", | |
| "Reduce overfitting using L2 regularization", | |
| "Apply monotonic normalization to inputs" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "monotone_constraints force LightGBM to maintain a monotonic relationship for specific features." | |
| }, | |
| { | |
| "id": 42, | |
| "questionText": "Scenario: You want reproducible results from LightGBM training. Which parameter helps?", | |
| "options": [ | |
| "random_state", | |
| "seed", | |
| "boosting_seed", | |
| "Any of the above" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Setting seed or random_state ensures deterministic behavior in LightGBM training." | |
| }, | |
| { | |
| "id": 43, | |
| "questionText": "Which LightGBM parameter defines the objective function?", | |
| "options": [ | |
| "objective", | |
| "metric", | |
| "boosting_type", | |
| "learning_rate" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The objective parameter defines the loss function that LightGBM optimizes, e.g., 'binary', 'regression'." | |
| }, | |
| { | |
| "id": 44, | |
| "questionText": "Scenario: You are using LightGBM for multi-class classification. What should the objective be set to?", | |
| "options": [ | |
| "multiclass", | |
| "multiclassova", | |
| "binary", | |
| "regression" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "For multi-class classification, objective='multiclass' should be used with num_class specified." | |
| }, | |
| { | |
| "id": 45, | |
| "questionText": "What does feature_pre_filter in LightGBM control?", | |
| "options": [ | |
| "Whether features are pre-screened before training", | |
| "Feature normalization", | |
| "Automatic feature selection during training", | |
| "Dropout of low importance features" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "feature_pre_filter determines if LightGBM filters out constant or low-variance features before training." | |
| }, | |
| { | |
| "id": 46, | |
| "questionText": "Scenario: You use categorical features in LightGBM. What advantage does native support provide?", | |
| "options": [ | |
| "Faster training and memory efficiency", | |
| "One-hot encoding automatically expands features", | |
| "Improved interpretability", | |
| "Model regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Native categorical handling avoids one-hot expansion, leading to faster and more efficient training." | |
| }, | |
| { | |
| "id": 47, | |
| "questionText": "Which LightGBM boosting type is best for highly imbalanced datasets?", | |
| "options": [ | |
| "goss", | |
| "gbdt", | |
| "dart", | |
| "rf" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "GOSS (Gradient-based One-Side Sampling) is efficient and works well for imbalanced datasets by focusing on large-gradient samples." | |
| }, | |
| { | |
| "id": 48, | |
| "questionText": "Scenario: LightGBM shows slightly worse accuracy than XGBoost. Which parameter tuning may help?", | |
| "options": [ | |
| "Reduce learning_rate and increase n_estimators", | |
| "Increase feature_fraction", | |
| "Reduce num_leaves", | |
| "Disable histogram optimization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Decreasing learning_rate while increasing n_estimators allows more refined learning and may improve accuracy." | |
| }, | |
| { | |
| "id": 49, | |
| "questionText": "Which LightGBM metric should you use for binary classification?", | |
| "options": [ | |
| "binary_logloss", | |
| "l2", | |
| "mae", | |
| "multi_logloss" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "binary_logloss measures the log loss for binary classification tasks." | |
| }, | |
| { | |
| "id": 50, | |
| "questionText": "Scenario: You want to save training time without losing much performance. Which parameters can be combined?", | |
| "options": [ | |
| "Use smaller max_bin, bagging_fraction, and feature_fraction", | |
| "Increase num_leaves and n_estimators", | |
| "Reduce learning_rate only", | |
| "Disable regularization terms" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing max_bin, bagging_fraction, and feature_fraction reduces training cost while maintaining accuracy." | |
| } | |
| ] | |
| } | |
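Several Part 2 questions (early stopping, class imbalance, max_bin, categorical features, seeds) come together in a single training call. The sketch below is a minimal illustration, assuming a LightGBM version that provides the lgb.early_stopping callback (3.3 or later); the DataFrame, the roughly 5% positive rate, and the parameter values are all hypothetical.

```python
import lightgbm as lgb
import numpy as np
import pandas as pd

# Hypothetical imbalanced dataset (~5% positives) with one categorical column.
rng = np.random.default_rng(0)
n = 10_000
df = pd.DataFrame({
    "x1": rng.normal(size=n),
    "x2": rng.normal(size=n),
    "cat": pd.Categorical(rng.integers(0, 5, size=n)),
})
y = (rng.random(n) < 0.05).astype(int)

train = lgb.Dataset(df.iloc[:8000], label=y[:8000], categorical_feature=["cat"])
valid = lgb.Dataset(df.iloc[8000:], label=y[8000:], reference=train)

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "max_bin": 63,             # fewer histogram bins -> less memory, faster training
    "scale_pos_weight": 19.0,  # roughly n_negative / n_positive for ~5% positives
    "num_leaves": 31,
    "learning_rate": 0.05,
    "seed": 42,                # fixed seed for reproducible sampling
}
booster = lgb.train(
    params, train, num_boost_round=2000, valid_sets=[valid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],  # stop when validation stalls
)
print("best iteration:", booster.best_iteration)
```

Because the cat column is passed through categorical_feature (and is already a pandas categorical), LightGBM finds splits on it directly instead of requiring one-hot encoding.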
| { | |
| "title": "LightGBM Mastery: Part 3 (Q51–75)", | |
| "description": "Intermediate to advanced questions (51–75) exploring LightGBM regularization, overfitting control, advanced parameters, and real-world optimization scenarios.", | |
| "questions": [ | |
| { | |
| "id": 51, | |
| "questionText": "What is the function of lambda_l1 in LightGBM?", | |
| "options": [ | |
| "Applies L1 regularization to leaf weights", | |
| "Limits number of leaves per tree", | |
| "Determines the learning rate", | |
| "Applies L2 regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "lambda_l1 controls L1 regularization on leaf weights to reduce overfitting." | |
| }, | |
| { | |
| "id": 52, | |
| "questionText": "What is the function of lambda_l2 in LightGBM?", | |
| "options": [ | |
| "Applies L2 regularization to leaf weights", | |
| "Controls learning rate", | |
| "Reduces feature fraction", | |
| "Prunes shallow trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "lambda_l2 applies L2 regularization on leaf weights to smooth large values and improve generalization." | |
| }, | |
| { | |
| "id": 53, | |
| "questionText": "Scenario: You notice LightGBM overfits heavily. Which parameters could help?", | |
| "options": [ | |
| "Decrease num_leaves, increase min_data_in_leaf", | |
| "Increase num_leaves, lower min_data_in_leaf", | |
| "Increase learning rate", | |
| "Increase max_depth only" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing num_leaves and increasing min_data_in_leaf makes trees simpler and prevents overfitting." | |
| }, | |
| { | |
| "id": 54, | |
| "questionText": "What does 'min_split_gain' control in LightGBM?", | |
| "options": [ | |
| "Minimum gain required to perform a split", | |
| "Maximum gain per leaf", | |
| "Learning rate adjustment", | |
| "Subsample fraction" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "min_split_gain sets a threshold for information gain; splits below this are ignored." | |
| }, | |
| { | |
| "id": 55, | |
| "questionText": "Which LightGBM parameter controls row sampling per iteration?", | |
| "options": [ | |
| "bagging_fraction", | |
| "feature_fraction", | |
| "num_leaves", | |
| "max_bin" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "bagging_fraction randomly samples rows for each boosting round to reduce variance." | |
| }, | |
| { | |
| "id": 56, | |
| "questionText": "What does 'feature_fraction' control in LightGBM?", | |
| "options": [ | |
| "Fraction of features used per tree", | |
| "Number of bins per feature", | |
| "Maximum depth of tree", | |
| "Feature scaling" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "feature_fraction specifies the proportion of features used to build each tree." | |
| }, | |
| { | |
| "id": 57, | |
| "questionText": "Scenario: Increasing feature_fraction improves accuracy but lowers stability. Why?", | |
| "options": [ | |
| "More features increase model variance", | |
| "Feature_fraction reduces bias", | |
| "Learning rate decreases automatically", | |
| "Tree depth is fixed" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Using more features per iteration reduces randomness and increases variance, which may reduce stability." | |
| }, | |
| { | |
| "id": 58, | |
| "questionText": "Which LightGBM parameter controls data sampling frequency?", | |
| "options": [ | |
| "bagging_freq", | |
| "num_iterations", | |
| "min_child_samples", | |
| "max_depth" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "bagging_freq controls how often (in iterations) row subsampling is performed." | |
| }, | |
| { | |
| "id": 59, | |
| "questionText": "Scenario: You set bagging_fraction=1.0 and feature_fraction=1.0. Effect?", | |
| "options": [ | |
| "No random sampling; all data and features used every iteration", | |
| "Strong regularization", | |
| "Improved generalization", | |
| "Subsampling increases variance" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Setting both to 1.0 disables random sampling, using all features and samples every iteration." | |
| }, | |
| { | |
| "id": 60, | |
| "questionText": "What does 'max_bin' affect in LightGBM?", | |
| "options": [ | |
| "Precision of feature discretization", | |
| "Learning rate", | |
| "Tree depth", | |
| "Bagging rate" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "max_bin determines how many bins each continuous feature is bucketed into for histogram-based splitting." | |
| }, | |
| { | |
| "id": 61, | |
| "questionText": "Higher max_bin values typically lead to what?", | |
| "options": [ | |
| "More precise splits but slower training", | |
| "Faster training with less precision", | |
| "More regularization", | |
| "Smaller trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Increasing max_bin gives more precise splits but increases memory and training time." | |
| }, | |
| { | |
| "id": 62, | |
| "questionText": "Scenario: Large dataset with limited RAM. What should you adjust?", | |
| "options": [ | |
| "Decrease max_bin and num_leaves", | |
| "Increase learning rate", | |
| "Disable histogram mode", | |
| "Increase max_depth" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing max_bin and num_leaves lowers memory footprint and speeds up training." | |
| }, | |
| { | |
| "id": 63, | |
| "questionText": "What is the function of 'min_data_in_leaf'?", | |
| "options": [ | |
| "Minimum number of samples required to form a leaf", | |
| "Maximum depth limit", | |
| "Learning rate controller", | |
| "Number of leaves in total" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "min_data_in_leaf ensures a minimum number of samples per leaf to avoid overfitting." | |
| }, | |
| { | |
| "id": 64, | |
| "questionText": "What happens if min_data_in_leaf is set too high?", | |
| "options": [ | |
| "Model underfits due to shallow trees", | |
| "Model overfits easily", | |
| "Learning rate decreases", | |
| "Training stops early" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Too high min_data_in_leaf makes leaves large and reduces model complexity, causing underfitting." | |
| }, | |
| { | |
| "id": 65, | |
| "questionText": "What parameter limits the maximum tree depth in LightGBM?", | |
| "options": [ | |
| "max_depth", | |
| "num_leaves", | |
| "min_data_in_leaf", | |
| "feature_fraction" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "max_depth caps how deep each tree can grow." | |
| }, | |
| { | |
| "id": 66, | |
| "questionText": "Scenario: You set max_depth=-1. What happens?", | |
| "options": [ | |
| "Tree depth is unlimited and controlled by num_leaves instead", | |
| "Training fails", | |
| "Trees become shallow automatically", | |
| "Regularization is disabled" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Setting max_depth=-1 removes explicit depth restriction; num_leaves indirectly limits complexity." | |
| }, | |
| { | |
| "id": 67, | |
| "questionText": "What is the effect of increasing num_iterations in LightGBM?", | |
| "options": [ | |
| "Model trains longer and may overfit if learning_rate is not reduced", | |
| "Model converges faster", | |
| "Less accurate model", | |
| "Shallower trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "More boosting iterations improve fit but can overfit unless compensated by lower learning rate." | |
| }, | |
| { | |
| "id": 68, | |
| "questionText": "Scenario: Reducing learning_rate but keeping num_iterations constant causes?", | |
| "options": [ | |
| "Underfitting, since model learns slower", | |
| "Overfitting", | |
| "Higher variance", | |
| "Deeper trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Low learning rate with few iterations may lead to underfitting as the model learns too slowly." | |
| }, | |
| { | |
| "id": 69, | |
| "questionText": "What is the main benefit of histogram-based decision trees in LightGBM?", | |
| "options": [ | |
| "Faster training and lower memory usage", | |
| "More precise split thresholds", | |
| "Supports only small datasets", | |
| "Improves interpretability" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Histogram-based methods speed up training by grouping continuous values into discrete bins." | |
| }, | |
| { | |
| "id": 70, | |
| "questionText": "Scenario: You increase max_bin significantly. What might happen?", | |
| "options": [ | |
| "Training slows down and may overfit", | |
| "Training speeds up", | |
| "Model ignores rare features", | |
| "Learning rate increases automatically" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Higher max_bin allows finer splits but can increase overfitting and computation time." | |
| }, | |
| { | |
| "id": 71, | |
| "questionText": "What parameter controls the number of boosting rounds?", | |
| "options": [ | |
| "num_iterations", | |
| "max_depth", | |
| "feature_fraction", | |
| "min_data_in_leaf" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "num_iterations defines the total number of boosting rounds (trees) to train." | |
| }, | |
| { | |
| "id": 72, | |
| "questionText": "Scenario: Decreasing num_iterations while keeping learning_rate fixed will usually?", | |
| "options": [ | |
| "Reduce model capacity and may underfit", | |
| "Cause overfitting", | |
| "Speed up convergence with higher accuracy", | |
| "Have no effect" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Fewer iterations reduce model capacity, leading to underfitting if learning_rate is unchanged." | |
| }, | |
| { | |
| "id": 73, | |
| "questionText": "What is the benefit of early_stopping_rounds in LightGBM?", | |
| "options": [ | |
| "Automatically halts training when validation loss stops improving", | |
| "Reduces learning rate dynamically", | |
| "Increases tree depth automatically", | |
| "Samples more features" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "early_stopping_rounds prevents overfitting by stopping when performance stops improving on validation data." | |
| }, | |
| { | |
| "id": 74, | |
| "questionText": "Which parameter combination best prevents overfitting?", | |
| "options": [ | |
| "Lower num_leaves, lower learning_rate, higher min_data_in_leaf", | |
| "Higher num_leaves, higher learning_rate", | |
| "Increase max_depth only", | |
| "Set bagging_fraction=1" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Simpler trees, smaller learning rate, and more data per leaf enhance generalization." | |
| }, | |
| { | |
| "id": 75, | |
| "questionText": "Scenario: Large data, strong overfitting, and high variance. What to do?", | |
| "options": [ | |
| "Lower num_leaves, use bagging and feature_fraction < 1", | |
| "Increase tree depth", | |
| "Raise learning rate", | |
| "Disable regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Using smaller trees and random sampling helps reduce overfitting and variance." | |
| } | |
| ] | |
| } | |
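Part 3 keeps returning to the same overfitting levers. As a summary, here is a hedged sketch of a more conservative parameter set that combines them; every value is an illustrative starting point for tuning rather than a recommendation, and the dictionary would be passed to lgb.train exactly like the earlier sketches.

```python
# Illustrative "regularized" configuration combining the Part 3 controls.
regularized_params = {
    "objective": "binary",
    "num_leaves": 15,            # fewer leaves -> simpler trees
    "max_depth": 6,              # explicit depth cap on leaf-wise growth
    "min_data_in_leaf": 100,     # larger leaves resist noise (too large -> underfitting)
    "min_gain_to_split": 0.01,   # skip splits with negligible loss reduction
    "lambda_l1": 1.0,            # L1 (lasso-style) penalty on leaf weights
    "lambda_l2": 1.0,            # L2 (ridge-style) penalty on leaf weights
    "feature_fraction": 0.7,     # column subsampling per tree
    "bagging_fraction": 0.7,     # row subsampling per iteration
    "bagging_freq": 1,
    "learning_rate": 0.02,       # smaller steps, compensated by more iterations
    "max_bin": 127,              # coarser bins: cheaper and mildly regularizing
}
# booster = lgb.train(regularized_params, train_set, num_boost_round=3000,
#                     valid_sets=[valid_set],
#                     callbacks=[lgb.early_stopping(stopping_rounds=100)])
```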
| { | |
| "title": "LightGBM Mastery: Part 4 (Q76–100)", | |
| "description": "Advanced and expert-level questions (76–100) exploring LightGBM’s GPU acceleration, categorical feature encoding, distributed learning, interpretability, and fine-tuning strategies.", | |
| "questions": [ | |
| { | |
| "id": 76, | |
| "questionText": "What is one key advantage of LightGBM over XGBoost?", | |
| "options": [ | |
| "Uses leaf-wise tree growth for faster convergence", | |
| "Uses level-wise tree growth for stability", | |
| "Cannot handle large datasets", | |
| "Lacks regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "LightGBM grows trees leaf-wise with depth constraints, achieving faster convergence and lower loss." | |
| }, | |
| { | |
| "id": 77, | |
| "questionText": "What happens if num_leaves is much larger than 2^max_depth?", | |
| "options": [ | |
| "Overfitting increases due to overly complex trees", | |
| "Model underfits severely", | |
| "No effect on model performance", | |
| "Training halts automatically" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Too many leaves compared to max_depth allow excessive branching, causing overfitting." | |
| }, | |
| { | |
| "id": 78, | |
| "questionText": "Scenario: You enable GPU support in LightGBM. What primary benefit is expected?", | |
| "options": [ | |
| "Faster histogram construction and split finding", | |
| "Improved accuracy", | |
| "Reduced model complexity", | |
| "Automatic regularization" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "GPU acceleration speeds up histogram building and split calculations, improving training speed." | |
| }, | |
| { | |
| "id": 79, | |
| "questionText": "Which LightGBM parameter enables GPU training?", | |
| "options": [ | |
| "device_type='gpu'", | |
| "gpu_enable=True", | |
| "use_gpu=1", | |
| "boosting_type='gpu'" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Setting device_type='gpu' tells LightGBM to use GPU resources for training." | |
| }, | |
| { | |
| "id": 80, | |
| "questionText": "What is the impact of 'boosting_type' parameter?", | |
| "options": [ | |
| "Selects the boosting algorithm (gbdt, dart, goss)", | |
| "Controls feature sampling", | |
| "Defines tree depth", | |
| "Applies learning rate decay" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "boosting_type specifies the boosting algorithm variant such as 'gbdt', 'dart', or 'goss'." | |
| }, | |
| { | |
| "id": 81, | |
| "questionText": "What is DART in LightGBM?", | |
| "options": [ | |
| "Dropouts meet Multiple Additive Regression Trees", | |
| "Distributed Automatic Regression Tree", | |
| "Dynamic Adaptive Regularized Trees", | |
| "Data Adaptive Reduction Technique" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "DART is 'Dropouts meet Multiple Additive Regression Trees', introducing dropout into boosting to reduce overfitting." | |
| }, | |
| { | |
| "id": 82, | |
| "questionText": "Scenario: Using boosting_type='goss'. What does GOSS stand for?", | |
| "options": [ | |
| "Gradient-based One-Side Sampling", | |
| "Global Overfitting Sample Selector", | |
| "Generalized Optimization for Split Search", | |
| "Gradient Optimization Sampling Strategy" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "GOSS stands for Gradient-based One-Side Sampling, reducing data processed per iteration for speed." | |
| }, | |
| { | |
| "id": 83, | |
| "questionText": "What does GOSS primarily do?", | |
| "options": [ | |
| "Keeps large-gradient samples and randomly drops small-gradient ones", | |
| "Drops large-gradient samples", | |
| "Uses all samples equally", | |
| "Increases number of trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "GOSS keeps high-gradient samples for training, reducing computation while preserving accuracy." | |
| }, | |
| { | |
| "id": 84, | |
| "questionText": "Scenario: Dataset has many categorical variables. What should you do?", | |
| "options": [ | |
| "Use LightGBM's built-in categorical feature support", | |
| "One-hot encode all features manually", | |
| "Convert to text data", | |
| "Ignore categorical columns" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "LightGBM natively supports categorical features through optimal split encoding without full one-hot expansion." | |
| }, | |
| { | |
| "id": 85, | |
| "questionText": "How does LightGBM handle categorical features internally?", | |
| "options": [ | |
| "Sorts categories by average target and finds best split", | |
| "Performs label encoding only", | |
| "Uses frequency encoding", | |
| "Applies hash bucketing" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "LightGBM sorts categories by their target mean to efficiently find the best split." | |
| }, | |
| { | |
| "id": 86, | |
| "questionText": "Scenario: Training time is long on large data with many features. What can help?", | |
| "options": [ | |
| "Reduce feature_fraction and bagging_fraction", | |
| "Increase num_leaves", | |
| "Disable histogram mode", | |
| "Increase max_bin drastically" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing feature_fraction and bagging_fraction speeds up training by using subsets of features and samples." | |
| }, | |
| { | |
| "id": 87, | |
| "questionText": "What is the role of 'max_cat_threshold'?", | |
| "options": [ | |
| "Controls maximum thresholds for categorical splits", | |
| "Limits maximum tree depth", | |
| "Sets number of categories allowed", | |
| "Defines learning rate schedule" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "max_cat_threshold limits how many thresholds LightGBM evaluates for categorical splits." | |
| }, | |
| { | |
| "id": 88, | |
| "questionText": "Scenario: Distributed LightGBM training is producing inconsistent results. Likely reason?", | |
| "options": [ | |
| "Non-deterministic data shuffling or parameter differences across nodes", | |
| "Too high learning rate", | |
| "Disabled GPU support", | |
| "Overfitting due to small num_leaves" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Different random seeds or node configurations in distributed mode can cause inconsistency." | |
| }, | |
| { | |
| "id": 89, | |
| "questionText": "What helps ensure reproducible LightGBM results?", | |
| "options": [ | |
| "Set deterministic=True and fix random_seed", | |
| "Increase bagging_fraction", | |
| "Enable GPU mode", | |
| "Reduce learning rate" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Setting deterministic=True and fixing random_seed ensures consistent results across runs." | |
| }, | |
| { | |
| "id": 90, | |
| "questionText": "Which LightGBM feature allows parallel learning across machines?", | |
| "options": [ | |
| "Distributed training mode", | |
| "Bagging", | |
| "GPU histograms", | |
| "Early stopping" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Distributed mode enables training across multiple machines using data parallelism." | |
| }, | |
| { | |
| "id": 91, | |
| "questionText": "What is the key difference between DART and standard GBDT?", | |
| "options": [ | |
| "DART randomly drops trees during training to prevent overfitting", | |
| "DART doubles learning rate dynamically", | |
| "DART uses fewer features per tree", | |
| "DART cannot perform regression tasks" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "DART introduces dropout on trees, improving regularization and generalization." | |
| }, | |
| { | |
| "id": 92, | |
| "questionText": "Scenario: Validation accuracy fluctuates heavily between iterations. Likely cause?", | |
| "options": [ | |
| "Learning rate too high or bagging too aggressive", | |
| "Too many trees", | |
| "Too few bins", | |
| "High lambda_l2" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "High learning rate or aggressive subsampling can cause instability in validation metrics." | |
| }, | |
| { | |
| "id": 93, | |
| "questionText": "What does 'linear_tree' parameter enable?", | |
| "options": [ | |
| "Adds linear models to each leaf for hybrid boosting", | |
| "Switches boosting type", | |
| "Performs polynomial regression", | |
| "Forces shallow trees" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "linear_tree enables a linear model within each leaf, combining tree and linear learning." | |
| }, | |
| { | |
| "id": 94, | |
| "questionText": "Scenario: Using linear_tree improved performance slightly but increased training time. Why?", | |
| "options": [ | |
| "Linear models per leaf require additional optimization", | |
| "Learning rate reduced automatically", | |
| "Tree structure became shallower", | |
| "Fewer bins created per feature" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Each leaf fits a small linear model, increasing training computation but often improving accuracy." | |
| }, | |
| { | |
| "id": 95, | |
| "questionText": "Which LightGBM setting improves memory efficiency on large data?", | |
| "options": [ | |
| "Use histogram pool sharing and smaller max_bin", | |
| "Increase max_depth", | |
| "Enable linear_tree", | |
| "Disable bagging" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reducing max_bin and using histogram sharing significantly lower memory usage." | |
| }, | |
| { | |
| "id": 96, | |
| "questionText": "Scenario: Feature importance shows unexpected zeros for numeric features. Why?", | |
| "options": [ | |
| "Feature was rarely used due to high correlation or low information gain", | |
| "Model error", | |
| "Bug in LightGBM", | |
| "Feature_fraction=1.0" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Highly correlated or uninformative features may never be chosen for splits, yielding zero importance." | |
| }, | |
| { | |
| "id": 97, | |
| "questionText": "What is the purpose of 'monotone_constraints'?", | |
| "options": [ | |
| "Forces model predictions to follow specified monotonic relationships with features", | |
| "Restricts tree depth", | |
| "Balances data classes", | |
| "Disables early stopping" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "monotone_constraints ensure predictions move consistently up or down with certain features." | |
| }, | |
| { | |
| "id": 98, | |
| "questionText": "Scenario: You set monotone_constraints incorrectly. Possible issue?", | |
| "options": [ | |
| "Model accuracy drops or fails to converge", | |
| "Training halts immediately", | |
| "All features are ignored", | |
| "Learning rate resets" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Wrong monotonic constraints can make optimization infeasible, harming accuracy or convergence." | |
| }, | |
| { | |
| "id": 99, | |
| "questionText": "What metric would you monitor for binary classification?", | |
| "options": [ | |
| "binary_logloss or AUC", | |
| "mean_squared_error", | |
| "poisson", | |
| "quantile" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "For binary tasks, LightGBM supports metrics like binary_logloss and AUC for evaluation." | |
| }, | |
| { | |
| "id": 100, | |
| "questionText": "Scenario: After tuning, training accuracy improves but test accuracy drops. What happened?", | |
| "options": [ | |
| "Overfitting", | |
| "Underfitting", | |
| "Learning rate too small", | |
| "Too many missing values" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Higher training accuracy with lower test performance indicates overfitting." | |
| } | |
| ] | |
| } | |
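The Part 4 questions touch options that change how training itself behaves (boosting_type, device_type, monotone_constraints, deterministic, linear_tree). The sketch below exercises the ones that run on any standard CPU build; GPU training and linear_tree are left as comments because they depend on how LightGBM was compiled and on feature compatibility. The data and the constraint signs are invented for illustration.

```python
import lightgbm as lgb
import numpy as np

# Hypothetical regression data: the target rises with feature 0 and falls with feature 1.
rng = np.random.default_rng(7)
X = rng.normal(size=(2000, 3))
y = X[:, 0] - 0.5 * X[:, 1] + rng.normal(scale=0.1, size=2000)
train = lgb.Dataset(X, label=y)

params = {
    "objective": "regression",
    "boosting_type": "gbdt",             # "dart" adds tree dropout, "goss" samples by gradient
    "device_type": "cpu",                # "gpu" if your build was compiled with GPU support
    "monotone_constraints": [1, -1, 0],  # +1 increasing, -1 decreasing, 0 unconstrained
    "deterministic": True,               # with a fixed seed, repeat runs match
    "force_row_wise": True,              # recommended together with deterministic
    "seed": 42,
    # "linear_tree": True,               # hybrid leaves with linear models (CPU only, slower)
    "num_leaves": 31,
    "learning_rate": 0.05,
}
booster = lgb.train(params, train, num_boost_round=200)
print(dict(zip(["f0", "f1", "f2"], booster.feature_importance().tolist())))
```

feature_importance() here reports split counts by default; a feature that is never chosen for any split shows zero importance, which is the situation described in question 96.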