{
"title": "LightGBM Mastery: Part 1 (Q1–25)",
"description": "Questions 1–25 covering LightGBM fundamentals — boosting basics, leaf-wise growth, histogram optimization, and key parameters.",
"questions": [
{
"id": 1,
"questionText": "What type of algorithm is LightGBM based on?",
"options": [
"Bagging",
"Boosting",
"Stacking",
"Voting"
],
"correctAnswerIndex": 1,
"explanation": "LightGBM is a boosting algorithm, specifically a gradient boosting framework that builds models sequentially."
},
{
"id": 2,
"questionText": "Which of the following is a unique characteristic of LightGBM?",
"options": [
"It grows trees level-wise",
"It grows trees leaf-wise",
"It uses deep neural networks",
"It averages models"
],
"correctAnswerIndex": 1,
"explanation": "LightGBM grows trees leaf-wise (best-first) to reduce loss more efficiently compared to level-wise methods."
},
{
"id": 3,
"questionText": "What type of trees does LightGBM primarily use?",
"options": [
"Shallow random trees",
"Deep neural trees",
"Decision trees",
"Regression trees"
],
"correctAnswerIndex": 3,
"explanation": "LightGBM primarily uses regression trees for both classification and regression tasks."
},
{
"id": 4,
"questionText": "Which technique allows LightGBM to handle large datasets efficiently?",
"options": [
"Feature hashing",
"Histogram-based algorithm",
"Random sampling",
"PCA compression"
],
"correctAnswerIndex": 1,
"explanation": "LightGBM uses a histogram-based algorithm to reduce computation by discretizing continuous features into bins."
},
{
"id": 5,
"questionText": "Scenario: You have very large dataset with millions of rows. Why is LightGBM preferred over XGBoost?",
"options": [
"Because it uses neural networks internally",
"Because it uses histogram-based splits and leaf-wise growth for efficiency",
"Because it reduces model interpretability",
"Because it doesn’t require gradient computation"
],
"correctAnswerIndex": 1,
"explanation": "LightGBM is optimized for large datasets using histogram-based splits and leaf-wise tree growth."
},
{
"id": 6,
"questionText": "Which LightGBM parameter controls the number of leaves in a single tree?",
"options": [
"num_leaves",
"max_depth",
"min_child_samples",
"n_estimators"
],
"correctAnswerIndex": 0,
"explanation": "The num_leaves parameter sets the maximum number of leaves in one tree, controlling model complexity."
},
{
"id": 7,
"questionText": "What happens if num_leaves is set too high?",
"options": [
"Model becomes underfit",
"Model becomes overfit",
"Model trains faster",
"Model ignores small features"
],
"correctAnswerIndex": 1,
"explanation": "Too many leaves can make the model overly complex and prone to overfitting."
},
{
"id": 8,
"questionText": "Which LightGBM parameter controls the learning rate?",
"options": [
"shrinkage_rate",
"alpha",
"learning_rate",
"lambda"
],
"correctAnswerIndex": 2,
"explanation": "The learning_rate parameter determines how much each new tree contributes to the model."
},
{
"id": 9,
"questionText": "Scenario: Model accuracy stagnates early during training. Which parameter can you increase?",
"options": [
"num_leaves",
"learning_rate",
"n_estimators",
"feature_fraction"
],
"correctAnswerIndex": 2,
"explanation": "Increasing n_estimators (number of boosting iterations) allows the model to learn longer."
},
{
"id": 10,
"questionText": "What is feature_fraction used for in LightGBM?",
"options": [
"Regularization to reduce overfitting by randomly selecting a fraction of features per tree",
"Adjusting leaf size",
"Reducing data size by sampling rows",
"Controlling the number of leaves"
],
"correctAnswerIndex": 0,
"explanation": "feature_fraction randomly selects a fraction of features to train each tree, helping regularize the model."
},
{
"id": 11,
"questionText": "Which LightGBM parameter limits tree depth?",
"options": [
"max_depth",
"num_leaves",
"min_split_gain",
"subsample"
],
"correctAnswerIndex": 0,
"explanation": "max_depth directly limits how deep trees can grow, preventing overfitting."
},
{
"id": 12,
"questionText": "What does min_child_samples control?",
"options": [
"Minimum number of samples required in a leaf",
"Minimum number of features used in a tree",
"Minimum iterations before early stopping",
"Minimum value for learning rate"
],
"correctAnswerIndex": 0,
"explanation": "min_child_samples ensures that leaves have enough data points, acting as a regularization technique."
},
{
"id": 13,
"questionText": "Scenario: LightGBM model is overfitting. Which change helps reduce it?",
"options": [
"Decrease num_leaves",
"Increase num_leaves",
"Increase learning rate",
"Remove regularization"
],
"correctAnswerIndex": 0,
"explanation": "Reducing num_leaves decreases model complexity and helps combat overfitting."
},
{
"id": 14,
"questionText": "What does boosting_type='dart' mean in LightGBM?",
"options": [
"It uses Dropouts meet Multiple Additive Regression Trees",
"It disables boosting",
"It performs bagging only",
"It builds random forests"
],
"correctAnswerIndex": 0,
"explanation": "The DART variant of LightGBM randomly drops trees during boosting to improve generalization."
},
{
"id": 15,
"questionText": "Which LightGBM boosting type uses dropouts for regularization?",
"options": [
"gbdt",
"dart",
"goss",
"rf"
],
"correctAnswerIndex": 1,
"explanation": "The DART boosting type introduces dropout in boosting to prevent overfitting."
},
{
"id": 16,
"questionText": "What does goss stand for in LightGBM?",
"options": [
"Gradient-based One-Side Sampling",
"Gradient Optimization Sampling System",
"Global Outlier Sampling Strategy",
"Generalized Optimization Split Search"
],
"correctAnswerIndex": 0,
"explanation": "GOSS is Gradient-based One-Side Sampling — a LightGBM optimization that speeds up training by sampling instances with large gradients."
},
{
"id": 17,
"questionText": "Which LightGBM parameter helps in row subsampling?",
"options": [
"bagging_fraction",
"feature_fraction",
"lambda_l1",
"min_split_gain"
],
"correctAnswerIndex": 0,
"explanation": "bagging_fraction controls the fraction of data used per iteration, providing row-wise subsampling."
},
{
"id": 18,
"questionText": "Scenario: You want faster training but can tolerate a small loss in accuracy. Which parameter can you reduce?",
"options": [
"bagging_fraction",
"num_leaves",
"max_depth",
"feature_fraction"
],
"correctAnswerIndex": 0,
"explanation": "Reducing bagging_fraction increases speed by using fewer data rows per iteration."
},
{
"id": 19,
"questionText": "What does lambda_l1 control in LightGBM?",
"options": [
"L1 regularization term on weights",
"L2 regularization term on weights",
"Dropout rate",
"Learning rate decay"
],
"correctAnswerIndex": 0,
"explanation": "lambda_l1 adds L1 regularization on leaf weights to encourage sparsity and reduce overfitting."
},
{
"id": 20,
"questionText": "Which LightGBM regularization term penalizes large leaf weights using L2 norm?",
"options": [
"lambda_l1",
"lambda_l2",
"min_child_samples",
"feature_fraction"
],
"correctAnswerIndex": 1,
"explanation": "lambda_l2 applies L2 regularization to prevent large leaf weights and stabilize training."
},
{
"id": 21,
"questionText": "Scenario: Model accuracy is fluctuating during boosting. Which parameter helps smooth this effect?",
"options": [
"learning_rate",
"num_leaves",
"min_gain_to_split",
"bagging_freq"
],
"correctAnswerIndex": 0,
"explanation": "A smaller learning_rate helps stabilize model updates, reducing fluctuations."
},
{
"id": 22,
"questionText": "What is the role of min_gain_to_split?",
"options": [
"Minimum loss reduction required for a split",
"Minimum number of leaves required per tree",
"Maximum number of features allowed",
"Learning rate decay factor"
],
"correctAnswerIndex": 0,
"explanation": "min_gain_to_split prevents small, insignificant splits by requiring a minimum loss reduction."
},
{
"id": 23,
"questionText": "Scenario: Dataset contains categorical variables. How does LightGBM handle them efficiently?",
"options": [
"Using one-hot encoding automatically",
"By internally converting them using optimal split algorithms",
"By ignoring categorical variables",
"By treating them as numeric values directly"
],
"correctAnswerIndex": 1,
"explanation": "LightGBM natively supports categorical features by finding optimal split points without full one-hot encoding."
},
{
"id": 24,
"questionText": "Which parameter in LightGBM is used to handle categorical features?",
"options": [
"categorical_feature",
"cat_var",
"cat_split",
"categorical_index"
],
"correctAnswerIndex": 0,
"explanation": "The categorical_feature parameter specifies which columns are treated as categorical during training."
},
{
"id": 25,
"questionText": "Scenario: LightGBM is using GPU for training. Which advantage does this offer?",
"options": [
"Faster histogram construction and split finding",
"Automatic feature engineering",
"Better interpretability",
"Improved regularization"
],
"correctAnswerIndex": 0,
"explanation": "GPU acceleration speeds up histogram creation and split computations, significantly reducing training time."
}
]
}
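The Part 1 questions above name the core training parameters; the short sketch below (not part of the quiz data) is one hedged illustration of how they map onto the lightgbm Python API. The synthetic DataFrame, column names, and parameter values are assumptions chosen only for demonstration.

import numpy as np
import pandas as pd
import lightgbm as lgb

# Hypothetical toy data; "city" is declared categorical so LightGBM can use
# its native categorical splits instead of one-hot encoding.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "age": rng.integers(18, 70, 500),
    "income": rng.normal(60_000, 15_000, 500),
    "city": pd.Categorical(rng.choice(["a", "b", "c"], 500)),
})
label = (df["income"] + rng.normal(0, 5_000, 500) > 60_000).astype(int)

train_set = lgb.Dataset(df, label=label, categorical_feature=["city"])
params = {
    "objective": "binary",
    "boosting_type": "gbdt",     # alternatives: "dart", "goss"
    "num_leaves": 31,            # max leaves per tree (leaf-wise growth)
    "max_depth": -1,             # -1: depth limited only indirectly by num_leaves
    "learning_rate": 0.1,        # contribution of each new tree
    "min_child_samples": 20,     # minimum samples required in a leaf
    "feature_fraction": 0.8,     # fraction of features sampled per tree
    "bagging_fraction": 0.8,     # fraction of rows sampled per iteration
    "bagging_freq": 1,           # resample rows every iteration
}
booster = lgb.train(params, train_set, num_boost_round=200)  # n_estimators equivalent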
{
"title": "LightGBM Mastery: Part 2 (Q26–50)",
"description": "Questions 26–50 exploring LightGBM tuning — sampling, regularization, parameter interactions, parallelization, and practical training strategies.",
"questions": [
{
"id": 26,
"questionText": "What does the bagging_freq parameter control in LightGBM?",
"options": [
"How frequently bagging is performed during training",
"Number of features to drop per tree",
"Learning rate schedule frequency",
"Regularization update interval"
],
"correctAnswerIndex": 0,
"explanation": "bagging_freq determines after how many boosting iterations LightGBM performs row subsampling."
},
{
"id": 27,
"questionText": "Which LightGBM optimization helps in handling large-scale datasets efficiently?",
"options": [
"Histogram-based binning",
"Deep tree expansion",
"Dynamic pruning",
"Recurrent boosting"
],
"correctAnswerIndex": 0,
"explanation": "LightGBM uses histogram-based binning to reduce computation and memory usage for large datasets."
},
{
"id": 28,
"questionText": "Scenario: Dataset contains extreme class imbalance. Which parameter helps mitigate it?",
"options": [
"scale_pos_weight",
"bagging_fraction",
"num_leaves",
"min_child_samples"
],
"correctAnswerIndex": 0,
"explanation": "scale_pos_weight adjusts the relative weight of positive samples to handle imbalanced datasets effectively."
},
{
"id": 29,
"questionText": "Which LightGBM setting should be increased to make the model less sensitive to noise?",
"options": [
"min_child_samples",
"num_leaves",
"learning_rate",
"max_depth"
],
"correctAnswerIndex": 0,
"explanation": "Increasing min_child_samples ensures leaves contain more data points, making the model more robust to noise."
},
{
"id": 30,
"questionText": "What is the role of early_stopping_round in LightGBM training?",
"options": [
"Stops training when validation loss does not improve after a certain number of rounds",
"Reduces learning rate automatically",
"Saves best iteration for retraining",
"Increases number of leaves gradually"
],
"correctAnswerIndex": 0,
"explanation": "early_stopping_round halts training if performance on validation data stops improving."
},
{
"id": 31,
"questionText": "Scenario: You observe that LightGBM trains very fast but underfits. What adjustment helps?",
"options": [
"Increase num_leaves or n_estimators",
"Decrease learning rate",
"Reduce max_depth",
"Reduce feature_fraction"
],
"correctAnswerIndex": 0,
"explanation": "Increasing num_leaves or n_estimators allows the model to capture more complexity and reduce underfitting."
},
{
"id": 32,
"questionText": "What does LightGBM’s leaf-wise tree growth mean?",
"options": [
"It splits the leaf with the highest loss reduction first",
"It splits all leaves at the same level simultaneously",
"It grows the tree symmetrically",
"It uses fixed depth trees"
],
"correctAnswerIndex": 0,
"explanation": "Leaf-wise growth selects and splits the leaf that gives the greatest loss reduction, leading to faster convergence."
},
{
"id": 33,
"questionText": "Which parameter combination most affects model complexity?",
"options": [
"num_leaves and max_depth",
"feature_fraction and bagging_fraction",
"learning_rate and n_estimators",
"lambda_l1 and lambda_l2"
],
"correctAnswerIndex": 0,
"explanation": "num_leaves and max_depth jointly control tree structure and hence the complexity of the model."
},
{
"id": 34,
"questionText": "Scenario: LightGBM runs out of memory on a massive dataset. Which setting helps reduce memory usage?",
"options": [
"Reduce max_bin",
"Increase learning_rate",
"Set boosting_type to dart",
"Increase num_leaves"
],
"correctAnswerIndex": 0,
"explanation": "Reducing max_bin decreases the number of histogram bins, lowering memory requirements."
},
{
"id": 35,
"questionText": "What does the parameter max_bin represent in LightGBM?",
"options": [
"Maximum number of bins to bucket continuous features",
"Maximum number of leaves per tree",
"Maximum depth of trees",
"Maximum iterations for convergence"
],
"correctAnswerIndex": 0,
"explanation": "max_bin determines how many discrete bins each feature will be divided into during histogram building."
},
{
"id": 36,
"questionText": "Scenario: Model training takes too long. Which adjustment improves speed most effectively?",
"options": [
"Reduce max_bin or use bagging_fraction < 1",
"Increase num_leaves",
"Reduce learning_rate only",
"Increase regularization terms"
],
"correctAnswerIndex": 0,
"explanation": "Reducing max_bin or using smaller bagging_fraction reduces the dataset processed each iteration, speeding up training."
},
{
"id": 37,
"questionText": "What is the primary drawback of leaf-wise tree growth?",
"options": [
"Higher risk of overfitting on small data",
"Slower convergence",
"Worse performance on large datasets",
"Poor categorical handling"
],
"correctAnswerIndex": 0,
"explanation": "Leaf-wise growth can overfit on small datasets because it may produce very deep trees."
},
{
"id": 38,
"questionText": "Which LightGBM parameter defines how many bins are created for each feature?",
"options": [
"max_bin",
"num_leaves",
"feature_fraction",
"max_depth"
],
"correctAnswerIndex": 0,
"explanation": "max_bin sets how finely continuous features are bucketed into discrete bins for histogram-based learning."
},
{
"id": 39,
"questionText": "What type of regularization do lambda_l1 and lambda_l2 correspond to?",
"options": [
"Lasso and Ridge regularization",
"Elastic Net regularization",
"Dropout regularization",
"Tree pruning regularization"
],
"correctAnswerIndex": 0,
"explanation": "lambda_l1 and lambda_l2 implement Lasso (L1) and Ridge (L2) regularization respectively."
},
{
"id": 40,
"questionText": "Scenario: You observe overfitting with high validation error. Which parameters help reduce it?",
"options": [
"Increase min_child_samples, decrease num_leaves",
"Increase learning_rate, increase num_leaves",
"Reduce lambda_l2",
"Increase feature_fraction"
],
"correctAnswerIndex": 0,
"explanation": "Increasing min_child_samples and reducing num_leaves simplify the model, reducing overfitting."
},
{
"id": 41,
"questionText": "What is the role of monotone_constraints in LightGBM?",
"options": [
"Ensure certain features have monotonic relationships with the target",
"Enforce equal feature importance",
"Reduce overfitting using L2 regularization",
"Apply monotonic normalization to inputs"
],
"correctAnswerIndex": 0,
"explanation": "monotone_constraints force LightGBM to maintain a monotonic relationship for specific features."
},
{
"id": 42,
"questionText": "Scenario: You want reproducible results from LightGBM training. Which parameter helps?",
"options": [
"random_state",
"seed",
"boosting_seed",
"Any of the above"
],
"correctAnswerIndex": 3,
"explanation": "Setting seed or random_state ensures deterministic behavior in LightGBM training."
},
{
"id": 43,
"questionText": "Which LightGBM parameter defines the objective function?",
"options": [
"objective",
"metric",
"boosting_type",
"learning_rate"
],
"correctAnswerIndex": 0,
"explanation": "The objective parameter defines the loss function that LightGBM optimizes, e.g., 'binary', 'regression'."
},
{
"id": 44,
"questionText": "Scenario: You are using LightGBM for multi-class classification. What should the objective be set to?",
"options": [
"multiclass",
"multiclassova",
"binary",
"regression"
],
"correctAnswerIndex": 0,
"explanation": "For multi-class classification, objective='multiclass' should be used with num_class specified."
},
{
"id": 45,
"questionText": "What does feature_pre_filter in LightGBM control?",
"options": [
"Whether features are pre-screened before training",
"Feature normalization",
"Automatic feature selection during training",
"Dropout of low importance features"
],
"correctAnswerIndex": 0,
"explanation": "feature_pre_filter determines if LightGBM filters out constant or low-variance features before training."
},
{
"id": 46,
"questionText": "Scenario: You use categorical features in LightGBM. What advantage does native support provide?",
"options": [
"Faster training and memory efficiency",
"One-hot encoding automatically expands features",
"Improved interpretability",
"Model regularization"
],
"correctAnswerIndex": 0,
"explanation": "Native categorical handling avoids one-hot expansion, leading to faster and more efficient training."
},
{
"id": 47,
"questionText": "Which LightGBM boosting type is best for highly imbalanced datasets?",
"options": [
"goss",
"gbdt",
"dart",
"rf"
],
"correctAnswerIndex": 0,
"explanation": "GOSS (Gradient-based One-Side Sampling) is efficient and works well for imbalanced datasets by focusing on large-gradient samples."
},
{
"id": 48,
"questionText": "Scenario: LightGBM shows slightly worse accuracy than XGBoost. Which parameter tuning may help?",
"options": [
"Reduce learning_rate and increase n_estimators",
"Increase feature_fraction",
"Reduce num_leaves",
"Disable histogram optimization"
],
"correctAnswerIndex": 0,
"explanation": "Decreasing learning_rate while increasing n_estimators allows more refined learning and may improve accuracy."
},
{
"id": 49,
"questionText": "Which LightGBM metric should you use for binary classification?",
"options": [
"binary_logloss",
"l2",
"mae",
"multi_logloss"
],
"correctAnswerIndex": 0,
"explanation": "binary_logloss measures the log loss for binary classification tasks."
},
{
"id": 50,
"questionText": "Scenario: You want to save training time without losing much performance. Which parameters can be combined?",
"options": [
"Use smaller max_bin, bagging_fraction, and feature_fraction",
"Increase num_leaves and n_estimators",
"Reduce learning_rate only",
"Disable regularization terms"
],
"correctAnswerIndex": 0,
"explanation": "Reducing max_bin, bagging_fraction, and feature_fraction reduces training cost while maintaining accuracy."
}
]
}
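Part 2 above covers objective and metric selection, max_bin, bagging, scale_pos_weight, and early stopping. The following sketch, with assumed synthetic data and illustrative values, shows one way these pieces fit together; it is a demonstration, not a tuned recipe.

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hypothetical imbalanced binary data (roughly 10% positives).
rng = np.random.default_rng(0)
X = rng.normal(size=(5_000, 20))
y = (X[:, 0] + rng.normal(scale=1.0, size=5_000) > 1.8).astype(int)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

train_set = lgb.Dataset(X_tr, label=y_tr, params={"max_bin": 127})  # coarser bins: less memory
valid_set = lgb.Dataset(X_va, label=y_va, reference=train_set)

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,           # resample rows every 5 iterations
    "scale_pos_weight": float((y_tr == 0).sum() / max((y_tr == 1).sum(), 1)),  # upweight positives
    "seed": 42,
}
booster = lgb.train(
    params,
    train_set,
    num_boost_round=2_000,
    valid_sets=[valid_set],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],  # stop when validation loss stalls
)
print("best iteration:", booster.best_iteration)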
{
"title": "LightGBM Mastery: Part 3 (Q51–75)",
"description": "Intermediate to advanced questions (51–75) exploring LightGBM regularization, overfitting control, advanced parameters, and real-world optimization scenarios.",
"questions": [
{
"id": 51,
"questionText": "What is the function of lambda_l1 in LightGBM?",
"options": [
"Applies L1 regularization to leaf weights",
"Limits number of leaves per tree",
"Determines the learning rate",
"Applies L2 regularization"
],
"correctAnswerIndex": 0,
"explanation": "lambda_l1 controls L1 regularization on leaf weights to reduce overfitting."
},
{
"id": 52,
"questionText": "What is the function of lambda_l2 in LightGBM?",
"options": [
"Applies L2 regularization to leaf weights",
"Controls learning rate",
"Reduces feature fraction",
"Prunes shallow trees"
],
"correctAnswerIndex": 0,
"explanation": "lambda_l2 applies L2 regularization on leaf weights to smooth large values and improve generalization."
},
{
"id": 53,
"questionText": "Scenario: You notice LightGBM overfits heavily. Which parameters could help?",
"options": [
"Decrease num_leaves, increase min_data_in_leaf",
"Increase num_leaves, lower min_data_in_leaf",
"Increase learning rate",
"Increase max_depth only"
],
"correctAnswerIndex": 0,
"explanation": "Reducing num_leaves and increasing min_data_in_leaf makes trees simpler and prevents overfitting."
},
{
"id": 54,
"questionText": "What does 'min_split_gain' control in LightGBM?",
"options": [
"Minimum gain required to perform a split",
"Maximum gain per leaf",
"Learning rate adjustment",
"Subsample fraction"
],
"correctAnswerIndex": 0,
"explanation": "min_split_gain sets a threshold for information gain; splits below this are ignored."
},
{
"id": 55,
"questionText": "Which LightGBM parameter controls row sampling per iteration?",
"options": [
"bagging_fraction",
"feature_fraction",
"num_leaves",
"max_bin"
],
"correctAnswerIndex": 0,
"explanation": "bagging_fraction randomly samples rows for each boosting round to reduce variance."
},
{
"id": 56,
"questionText": "What does 'feature_fraction' control in LightGBM?",
"options": [
"Fraction of features used per tree",
"Number of bins per feature",
"Maximum depth of tree",
"Feature scaling"
],
"correctAnswerIndex": 0,
"explanation": "feature_fraction specifies the proportion of features used to build each tree."
},
{
"id": 57,
"questionText": "Scenario: Increasing feature_fraction improves accuracy but lowers stability. Why?",
"options": [
"More features increase model variance",
"Feature_fraction reduces bias",
"Learning rate decreases automatically",
"Tree depth is fixed"
],
"correctAnswerIndex": 0,
"explanation": "Using more features per iteration reduces randomness and increases variance, which may reduce stability."
},
{
"id": 58,
"questionText": "Which LightGBM parameter controls data sampling frequency?",
"options": [
"bagging_freq",
"num_iterations",
"min_child_samples",
"max_depth"
],
"correctAnswerIndex": 0,
"explanation": "bagging_freq controls how often (in iterations) row subsampling is performed."
},
{
"id": 59,
"questionText": "Scenario: You set bagging_fraction=1.0 and feature_fraction=1.0. Effect?",
"options": [
"No random sampling; all data and features used every iteration",
"Strong regularization",
"Improved generalization",
"Subsampling increases variance"
],
"correctAnswerIndex": 0,
"explanation": "Setting both to 1.0 disables random sampling, using all features and samples every iteration."
},
{
"id": 60,
"questionText": "What does 'max_bin' affect in LightGBM?",
"options": [
"Precision of feature discretization",
"Learning rate",
"Tree depth",
"Bagging rate"
],
"correctAnswerIndex": 0,
"explanation": "max_bin determines how many bins each continuous feature is bucketed into for histogram-based splitting."
},
{
"id": 61,
"questionText": "Higher max_bin values typically lead to what?",
"options": [
"More precise splits but slower training",
"Faster training with less precision",
"More regularization",
"Smaller trees"
],
"correctAnswerIndex": 0,
"explanation": "Increasing max_bin gives more precise splits but increases memory and training time."
},
{
"id": 62,
"questionText": "Scenario: Large dataset with limited RAM. What should you adjust?",
"options": [
"Decrease max_bin and num_leaves",
"Increase learning rate",
"Disable histogram mode",
"Increase max_depth"
],
"correctAnswerIndex": 0,
"explanation": "Reducing max_bin and num_leaves lowers memory footprint and speeds up training."
},
{
"id": 63,
"questionText": "What is the function of 'min_data_in_leaf'?",
"options": [
"Minimum number of samples required to form a leaf",
"Maximum depth limit",
"Learning rate controller",
"Number of leaves in total"
],
"correctAnswerIndex": 0,
"explanation": "min_data_in_leaf ensures a minimum number of samples per leaf to avoid overfitting."
},
{
"id": 64,
"questionText": "What happens if min_data_in_leaf is set too high?",
"options": [
"Model underfits due to shallow trees",
"Model overfits easily",
"Learning rate decreases",
"Training stops early"
],
"correctAnswerIndex": 0,
"explanation": "Too high min_data_in_leaf makes leaves large and reduces model complexity, causing underfitting."
},
{
"id": 65,
"questionText": "What parameter limits the maximum tree depth in LightGBM?",
"options": [
"max_depth",
"num_leaves",
"min_data_in_leaf",
"feature_fraction"
],
"correctAnswerIndex": 0,
"explanation": "max_depth caps how deep each tree can grow."
},
{
"id": 66,
"questionText": "Scenario: You set max_depth=-1. What happens?",
"options": [
"Tree depth is unlimited and controlled by num_leaves instead",
"Training fails",
"Trees become shallow automatically",
"Regularization is disabled"
],
"correctAnswerIndex": 0,
"explanation": "Setting max_depth=-1 removes explicit depth restriction; num_leaves indirectly limits complexity."
},
{
"id": 67,
"questionText": "What is the effect of increasing num_iterations in LightGBM?",
"options": [
"Model trains longer and may overfit if learning_rate is not reduced",
"Model converges faster",
"Less accurate model",
"Shallower trees"
],
"correctAnswerIndex": 0,
"explanation": "More boosting iterations improve fit but can overfit unless compensated by lower learning rate."
},
{
"id": 68,
"questionText": "Scenario: Reducing learning_rate but keeping num_iterations constant causes?",
"options": [
"Underfitting, since model learns slower",
"Overfitting",
"Higher variance",
"Deeper trees"
],
"correctAnswerIndex": 0,
"explanation": "Low learning rate with few iterations may lead to underfitting as the model learns too slowly."
},
{
"id": 69,
"questionText": "What is the main benefit of histogram-based decision trees in LightGBM?",
"options": [
"Faster training and lower memory usage",
"More precise split thresholds",
"Supports only small datasets",
"Improves interpretability"
],
"correctAnswerIndex": 0,
"explanation": "Histogram-based methods speed up training by grouping continuous values into discrete bins."
},
{
"id": 70,
"questionText": "Scenario: You increase max_bin significantly. What might happen?",
"options": [
"Training slows down and may overfit",
"Training speeds up",
"Model ignores rare features",
"Learning rate increases automatically"
],
"correctAnswerIndex": 0,
"explanation": "Higher max_bin allows finer splits but can increase overfitting and computation time."
},
{
"id": 71,
"questionText": "What parameter controls the number of boosting rounds?",
"options": [
"num_iterations",
"max_depth",
"feature_fraction",
"min_data_in_leaf"
],
"correctAnswerIndex": 0,
"explanation": "num_iterations defines the total number of boosting rounds (trees) to train."
},
{
"id": 72,
"questionText": "Scenario: Decreasing num_iterations while keeping learning_rate fixed will usually?",
"options": [
"Reduce model capacity and may underfit",
"Cause overfitting",
"Speed up convergence with higher accuracy",
"Have no effect"
],
"correctAnswerIndex": 0,
"explanation": "Fewer iterations reduce model capacity, leading to underfitting if learning_rate is unchanged."
},
{
"id": 73,
"questionText": "What is the benefit of early_stopping_rounds in LightGBM?",
"options": [
"Automatically halts training when validation loss stops improving",
"Reduces learning rate dynamically",
"Increases tree depth automatically",
"Samples more features"
],
"correctAnswerIndex": 0,
"explanation": "early_stopping_rounds prevents overfitting by stopping when performance stops improving on validation data."
},
{
"id": 74,
"questionText": "Which parameter combination best prevents overfitting?",
"options": [
"Lower num_leaves, lower learning_rate, higher min_data_in_leaf",
"Higher num_leaves, higher learning_rate",
"Increase max_depth only",
"Set bagging_fraction=1"
],
"correctAnswerIndex": 0,
"explanation": "Simpler trees, smaller learning rate, and more data per leaf enhance generalization."
},
{
"id": 75,
"questionText": "Scenario: Large data, strong overfitting, and high variance. What to do?",
"options": [
"Lower num_leaves, use bagging and feature_fraction < 1",
"Increase tree depth",
"Raise learning rate",
"Disable regularization"
],
"correctAnswerIndex": 0,
"explanation": "Using smaller trees and random sampling helps reduce overfitting and variance."
}
]
}
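Part 3 above is mostly about reining in overfitting. Below is a hedged sketch of a "regularized" parameter set that combines those knobs (simpler trees, more data per leaf, L1/L2 penalties, row and feature subsampling, smaller learning_rate); the data and values are assumptions for illustration only.

import numpy as np
import lightgbm as lgb

# Hypothetical regression data so the snippet runs end to end.
rng = np.random.default_rng(1)
X = rng.normal(size=(2_000, 10))
y = 3.0 * X[:, 0] + X[:, 1] ** 2 + rng.normal(scale=0.5, size=2_000)

train_set = lgb.Dataset(X, label=y, params={"max_bin": 127})  # coarser histograms

regularized_params = {
    "objective": "regression",
    "metric": "l2",
    "num_leaves": 15,            # fewer leaves: simpler trees
    "max_depth": 6,              # explicit depth cap
    "min_data_in_leaf": 50,      # larger leaves resist noise
    "min_gain_to_split": 0.01,   # skip near-useless splits
    "lambda_l1": 0.1,            # L1 (Lasso-style) penalty on leaf weights
    "lambda_l2": 1.0,            # L2 (Ridge-style) penalty on leaf weights
    "feature_fraction": 0.7,     # 70% of features per tree
    "bagging_fraction": 0.7,     # 70% of rows ...
    "bagging_freq": 1,           # ... resampled every iteration
    "learning_rate": 0.05,       # smaller steps; pair with more rounds
}
booster = lgb.train(regularized_params, train_set, num_boost_round=500)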
{
"title": "LightGBM Mastery: Part 4 (Q76–100)",
"description": "Advanced and expert-level questions (76–100) exploring LightGBM’s GPU acceleration, categorical feature encoding, distributed learning, interpretability, and fine-tuning strategies.",
"questions": [
{
"id": 76,
"questionText": "What is one key advantage of LightGBM over XGBoost?",
"options": [
"Uses leaf-wise tree growth for faster convergence",
"Uses level-wise tree growth for stability",
"Cannot handle large datasets",
"Lacks regularization"
],
"correctAnswerIndex": 0,
"explanation": "LightGBM grows trees leaf-wise with depth constraints, achieving faster convergence and lower loss."
},
{
"id": 77,
"questionText": "What happens if num_leaves is much larger than 2^max_depth?",
"options": [
"Overfitting increases due to overly complex trees",
"Model underfits severely",
"No effect on model performance",
"Training halts automatically"
],
"correctAnswerIndex": 0,
"explanation": "Too many leaves compared to max_depth allow excessive branching, causing overfitting."
},
{
"id": 78,
"questionText": "Scenario: You enable GPU support in LightGBM. What primary benefit is expected?",
"options": [
"Faster histogram construction and split finding",
"Improved accuracy",
"Reduced model complexity",
"Automatic regularization"
],
"correctAnswerIndex": 0,
"explanation": "GPU acceleration speeds up histogram building and split calculations, improving training speed."
},
{
"id": 79,
"questionText": "Which LightGBM parameter enables GPU training?",
"options": [
"device_type='gpu'",
"gpu_enable=True",
"use_gpu=1",
"boosting_type='gpu'"
],
"correctAnswerIndex": 0,
"explanation": "Setting device_type='gpu' tells LightGBM to use GPU resources for training."
},
{
"id": 80,
"questionText": "What is the impact of 'boosting_type' parameter?",
"options": [
"Selects the boosting algorithm (gbdt, dart, goss)",
"Controls feature sampling",
"Defines tree depth",
"Applies learning rate decay"
],
"correctAnswerIndex": 0,
"explanation": "boosting_type specifies the boosting algorithm variant such as 'gbdt', 'dart', or 'goss'."
},
{
"id": 81,
"questionText": "What is DART in LightGBM?",
"options": [
"Dropouts meet Multiple Additive Regression Trees",
"Distributed Automatic Regression Tree",
"Dynamic Adaptive Regularized Trees",
"Data Adaptive Reduction Technique"
],
"correctAnswerIndex": 0,
"explanation": "DART is 'Dropouts meet Multiple Additive Regression Trees', introducing dropout into boosting to reduce overfitting."
},
{
"id": 82,
"questionText": "Scenario: Using boosting_type='goss'. What does GOSS stand for?",
"options": [
"Gradient-based One-Side Sampling",
"Global Overfitting Sample Selector",
"Generalized Optimization for Split Search",
"Gradient Optimization Sampling Strategy"
],
"correctAnswerIndex": 0,
"explanation": "GOSS stands for Gradient-based One-Side Sampling, reducing data processed per iteration for speed."
},
{
"id": 83,
"questionText": "What does GOSS primarily do?",
"options": [
"Keeps large-gradient samples and randomly drops small-gradient ones",
"Drops large-gradient samples",
"Uses all samples equally",
"Increases number of trees"
],
"correctAnswerIndex": 0,
"explanation": "GOSS keeps high-gradient samples for training, reducing computation while preserving accuracy."
},
{
"id": 84,
"questionText": "Scenario: Dataset has many categorical variables. What should you do?",
"options": [
"Use LightGBM's built-in categorical feature support",
"One-hot encode all features manually",
"Convert to text data",
"Ignore categorical columns"
],
"correctAnswerIndex": 0,
"explanation": "LightGBM natively supports categorical features through optimal split encoding without full one-hot expansion."
},
{
"id": 85,
"questionText": "How does LightGBM handle categorical features internally?",
"options": [
"Sorts categories by average target and finds best split",
"Performs label encoding only",
"Uses frequency encoding",
"Applies hash bucketing"
],
"correctAnswerIndex": 0,
"explanation": "LightGBM sorts categories by their target mean to efficiently find the best split."
},
{
"id": 86,
"questionText": "Scenario: Training time is long on large data with many features. What can help?",
"options": [
"Reduce feature_fraction and bagging_fraction",
"Increase num_leaves",
"Disable histogram mode",
"Increase max_bin drastically"
],
"correctAnswerIndex": 0,
"explanation": "Reducing feature_fraction and bagging_fraction speeds up training by using subsets of features and samples."
},
{
"id": 87,
"questionText": "What is the role of 'max_cat_threshold'?",
"options": [
"Controls maximum thresholds for categorical splits",
"Limits maximum tree depth",
"Sets number of categories allowed",
"Defines learning rate schedule"
],
"correctAnswerIndex": 0,
"explanation": "max_cat_threshold limits how many thresholds LightGBM evaluates for categorical splits."
},
{
"id": 88,
"questionText": "Scenario: Distributed LightGBM training is producing inconsistent results. Likely reason?",
"options": [
"Non-deterministic data shuffling or parameter differences across nodes",
"Too high learning rate",
"Disabled GPU support",
"Overfitting due to small num_leaves"
],
"correctAnswerIndex": 0,
"explanation": "Different random seeds or node configurations in distributed mode can cause inconsistency."
},
{
"id": 89,
"questionText": "What helps ensure reproducible LightGBM results?",
"options": [
"Set deterministic=True and fix random_seed",
"Increase bagging_fraction",
"Enable GPU mode",
"Reduce learning rate"
],
"correctAnswerIndex": 0,
"explanation": "Setting deterministic=True and fixing random_seed ensures consistent results across runs."
},
{
"id": 90,
"questionText": "Which LightGBM feature allows parallel learning across machines?",
"options": [
"Distributed training mode",
"Bagging",
"GPU histograms",
"Early stopping"
],
"correctAnswerIndex": 0,
"explanation": "Distributed mode enables training across multiple machines using data parallelism."
},
{
"id": 91,
"questionText": "What is the key difference between DART and standard GBDT?",
"options": [
"DART randomly drops trees during training to prevent overfitting",
"DART doubles learning rate dynamically",
"DART uses fewer features per tree",
"DART cannot perform regression tasks"
],
"correctAnswerIndex": 0,
"explanation": "DART introduces dropout on trees, improving regularization and generalization."
},
{
"id": 92,
"questionText": "Scenario: Validation accuracy fluctuates heavily between iterations. Likely cause?",
"options": [
"Learning rate too high or bagging too aggressive",
"Too many trees",
"Too few bins",
"High lambda_l2"
],
"correctAnswerIndex": 0,
"explanation": "High learning rate or aggressive subsampling can cause instability in validation metrics."
},
{
"id": 93,
"questionText": "What does 'linear_tree' parameter enable?",
"options": [
"Adds linear models to each leaf for hybrid boosting",
"Switches boosting type",
"Performs polynomial regression",
"Forces shallow trees"
],
"correctAnswerIndex": 0,
"explanation": "linear_tree enables a linear model within each leaf, combining tree and linear learning."
},
{
"id": 94,
"questionText": "Scenario: Using linear_tree improved performance slightly but increased training time. Why?",
"options": [
"Linear models per leaf require additional optimization",
"Learning rate reduced automatically",
"Tree structure became shallower",
"Fewer bins created per feature"
],
"correctAnswerIndex": 0,
"explanation": "Each leaf fits a small linear model, increasing training computation but often improving accuracy."
},
{
"id": 95,
"questionText": "Which LightGBM setting improves memory efficiency on large data?",
"options": [
"Use histogram pool sharing and smaller max_bin",
"Increase max_depth",
"Enable linear_tree",
"Disable bagging"
],
"correctAnswerIndex": 0,
"explanation": "Reducing max_bin and using histogram sharing significantly lower memory usage."
},
{
"id": 96,
"questionText": "Scenario: Feature importance shows unexpected zeros for numeric features. Why?",
"options": [
"Feature was rarely used due to high correlation or low information gain",
"Model error",
"Bug in LightGBM",
"Feature_fraction=1.0"
],
"correctAnswerIndex": 0,
"explanation": "Highly correlated or uninformative features may never be chosen for splits, yielding zero importance."
},
{
"id": 97,
"questionText": "What is the purpose of 'monotone_constraints'?",
"options": [
"Forces model predictions to follow specified monotonic relationships with features",
"Restricts tree depth",
"Balances data classes",
"Disables early stopping"
],
"correctAnswerIndex": 0,
"explanation": "monotone_constraints ensure predictions move consistently up or down with certain features."
},
{
"id": 98,
"questionText": "Scenario: You set monotone_constraints incorrectly. Possible issue?",
"options": [
"Model accuracy drops or fails to converge",
"Training halts immediately",
"All features are ignored",
"Learning rate resets"
],
"correctAnswerIndex": 0,
"explanation": "Wrong monotonic constraints can make optimization infeasible, harming accuracy or convergence."
},
{
"id": 99,
"questionText": "What metric would you monitor for binary classification?",
"options": [
"binary_logloss or AUC",
"mean_squared_error",
"poisson",
"quantile"
],
"correctAnswerIndex": 0,
"explanation": "For binary tasks, LightGBM supports metrics like binary_logloss and AUC for evaluation."
},
{
"id": 100,
"questionText": "Scenario: After tuning, training accuracy improves but test accuracy drops. What happened?",
"options": [
"Overfitting",
"Underfitting",
"Learning rate too small",
"Too many missing values"
],
"correctAnswerIndex": 0,
"explanation": "Higher training accuracy with lower test performance indicates overfitting."
}
]
}
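Part 4 above touches the advanced switches: monotone constraints, reproducibility, GPU training, DART/GOSS, and linear trees. The closing sketch below demonstrates a plausible combination on assumed three-feature data; the GPU and linear_tree lines are left commented out because they depend on the build and task.

import numpy as np
import lightgbm as lgb

# Hypothetical data: the target rises with feature 0, falls with feature 1,
# and ignores feature 2, matching the monotone_constraints below.
rng = np.random.default_rng(7)
X = rng.normal(size=(3_000, 3))
y = (2.0 * X[:, 0] - 1.5 * X[:, 1] + rng.normal(scale=0.5, size=3_000) > 0).astype(int)

advanced_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",             # alternatives: "dart" (tree dropout), "goss"
    "num_leaves": 63,
    "learning_rate": 0.05,
    "monotone_constraints": [1, -1, 0],  # +1 increasing, -1 decreasing, 0 unconstrained
    "deterministic": True,               # with a fixed seed: reproducible runs
    "force_row_wise": True,              # recommended alongside deterministic=True
    "seed": 42,
    # "linear_tree": True,               # linear model inside each leaf (CPU only)
    # "device_type": "gpu",              # requires a GPU-enabled LightGBM build
}
train_set = lgb.Dataset(X, label=y)
booster = lgb.train(advanced_params, train_set, num_boost_round=200)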