{ "title": "LightGBM Mastery: Part 1 (Q1–25)", "description": "Questions 1–25 covering LightGBM fundamentals — boosting basics, leaf-wise growth, histogram optimization, and key parameters.", "questions": [ { "id": 1, "questionText": "What type of algorithm is LightGBM based on?", "options": [ "Bagging", "Boosting", "Stacking", "Voting" ], "correctAnswerIndex": 1, "explanation": "LightGBM is a boosting algorithm, specifically a gradient boosting framework that builds models sequentially." }, { "id": 2, "questionText": "Which of the following is a unique characteristic of LightGBM?", "options": [ "It grows trees level-wise", "It grows trees leaf-wise", "It uses deep neural networks", "It averages models" ], "correctAnswerIndex": 1, "explanation": "LightGBM grows trees leaf-wise (best-first) to reduce loss more efficiently compared to level-wise methods." }, { "id": 3, "questionText": "What type of trees does LightGBM primarily use?", "options": [ "Shallow random trees", "Deep neural trees", "Decision trees", "Regression trees" ], "correctAnswerIndex": 3, "explanation": "LightGBM primarily uses regression trees for both classification and regression tasks." }, { "id": 4, "questionText": "Which technique allows LightGBM to handle large datasets efficiently?", "options": [ "Feature hashing", "Histogram-based algorithm", "Random sampling", "PCA compression" ], "correctAnswerIndex": 1, "explanation": "LightGBM uses a histogram-based algorithm to reduce computation by discretizing continuous features into bins." }, { "id": 5, "questionText": "Scenario: You have a very large dataset with millions of rows. 
Why is LightGBM preferred over XGBoost?", "options": [ "Because it uses neural networks internally", "Because it uses histogram-based splits and leaf-wise growth for efficiency", "Because it reduces model interpretability", "Because it doesn’t require gradient computation" ], "correctAnswerIndex": 1, "explanation": "LightGBM is optimized for large datasets using histogram-based splits and leaf-wise tree growth." }, { "id": 6, "questionText": "Which LightGBM parameter controls the number of leaves in a single tree?", "options": [ "num_leaves", "max_depth", "min_child_samples", "n_estimators" ], "correctAnswerIndex": 0, "explanation": "The num_leaves parameter sets the maximum number of leaves in one tree, controlling model complexity." }, { "id": 7, "questionText": "What happens if num_leaves is set too high?", "options": [ "Model becomes underfit", "Model becomes overfit", "Model trains faster", "Model ignores small features" ], "correctAnswerIndex": 1, "explanation": "Too many leaves can make the model overly complex and prone to overfitting." }, { "id": 8, "questionText": "Which LightGBM parameter controls the learning rate?", "options": [ "shrinkage_rate", "alpha", "learning_rate", "lambda" ], "correctAnswerIndex": 2, "explanation": "The learning_rate parameter determines how much each new tree contributes to the model." }, { "id": 9, "questionText": "Scenario: Model accuracy stagnates early during training. Which parameter can you increase?", "options": [ "num_leaves", "learning_rate", "n_estimators", "feature_fraction" ], "correctAnswerIndex": 2, "explanation": "Increasing n_estimators (number of boosting iterations) allows the model to learn longer." 
}, { "id": 10, "questionText": "What is feature_fraction used for in LightGBM?", "options": [ "Regularization to reduce overfitting by randomly selecting a fraction of features per tree", "Adjusting leaf size", "Reducing data size by sampling rows", "Controlling the number of leaves" ], "correctAnswerIndex": 0, "explanation": "feature_fraction randomly selects a fraction of features to train each tree, helping regularize the model." }, { "id": 11, "questionText": "Which LightGBM parameter limits tree depth?", "options": [ "max_depth", "num_leaves", "min_split_gain", "subsample" ], "correctAnswerIndex": 0, "explanation": "max_depth directly limits how deep trees can grow, preventing overfitting." }, { "id": 12, "questionText": "What does min_child_samples control?", "options": [ "Minimum number of samples required in a leaf", "Minimum number of features used in a tree", "Minimum iterations before early stopping", "Minimum value for learning rate" ], "correctAnswerIndex": 0, "explanation": "min_child_samples ensures that leaves have enough data points, acting as a regularization technique." }, { "id": 13, "questionText": "Scenario: LightGBM model is overfitting. Which change helps reduce it?", "options": [ "Decrease num_leaves", "Increase num_leaves", "Increase learning rate", "Remove regularization" ], "correctAnswerIndex": 0, "explanation": "Reducing num_leaves decreases model complexity and helps combat overfitting." }, { "id": 14, "questionText": "What does boosting_type='dart' mean in LightGBM?", "options": [ "It uses Dropouts meet Multiple Additive Regression Trees", "It disables boosting", "It performs bagging only", "It builds random forests" ], "correctAnswerIndex": 0, "explanation": "The DART variant of LightGBM randomly drops trees during boosting to improve generalization." 
}, { "id": 15, "questionText": "Which LightGBM boosting type uses dropouts for regularization?", "options": [ "gbdt", "dart", "goss", "rf" ], "correctAnswerIndex": 1, "explanation": "The DART boosting type introduces dropout in boosting to prevent overfitting." }, { "id": 16, "questionText": "What does goss stand for in LightGBM?", "options": [ "Gradient-based One-Side Sampling", "Gradient Optimization Sampling System", "Global Outlier Sampling Strategy", "Generalized Optimization Split Search" ], "correctAnswerIndex": 0, "explanation": "GOSS is Gradient-based One-Side Sampling — a LightGBM optimization that speeds up training by keeping all instances with large gradients and randomly sampling from the instances with small gradients." }, { "id": 17, "questionText": "Which LightGBM parameter helps in row subsampling?", "options": [ "bagging_fraction", "feature_fraction", "lambda_l1", "min_split_gain" ], "correctAnswerIndex": 0, "explanation": "bagging_fraction controls the fraction of data used per iteration, providing row-wise subsampling." }, { "id": 18, "questionText": "Scenario: You want faster training but can tolerate a small loss in accuracy. Which parameter can you reduce?", "options": [ "bagging_fraction", "num_leaves", "max_depth", "feature_fraction" ], "correctAnswerIndex": 0, "explanation": "Reducing bagging_fraction increases speed by using fewer data rows per iteration." }, { "id": 19, "questionText": "What does lambda_l1 control in LightGBM?", "options": [ "L1 regularization term on weights", "L2 regularization term on weights", "Dropout rate", "Learning rate decay" ], "correctAnswerIndex": 0, "explanation": "lambda_l1 adds L1 regularization on leaf weights to encourage sparsity and reduce overfitting." 
}, { "id": 20, "questionText": "Which LightGBM regularization term penalizes large leaf weights using L2 norm?", "options": [ "lambda_l1", "lambda_l2", "min_child_samples", "feature_fraction" ], "correctAnswerIndex": 1, "explanation": "lambda_l2 applies L2 regularization to prevent large leaf weights and stabilize training." }, { "id": 21, "questionText": "Scenario: Model accuracy is fluctuating during boosting. Which parameter helps smooth this effect?", "options": [ "learning_rate", "num_leaves", "min_gain_to_split", "bagging_freq" ], "correctAnswerIndex": 0, "explanation": "A smaller learning_rate helps stabilize model updates, reducing fluctuations." }, { "id": 22, "questionText": "What is the role of min_gain_to_split?", "options": [ "Minimum loss reduction required for a split", "Minimum number of leaves required per tree", "Maximum number of features allowed", "Learning rate decay factor" ], "correctAnswerIndex": 0, "explanation": "min_gain_to_split prevents small, insignificant splits by requiring a minimum loss reduction." }, { "id": 23, "questionText": "Scenario: Dataset contains categorical variables. How does LightGBM handle them efficiently?", "options": [ "Using one-hot encoding automatically", "By internally converting them using optimal split algorithms", "By ignoring categorical variables", "By treating them as numeric values directly" ], "correctAnswerIndex": 1, "explanation": "LightGBM natively supports categorical features by finding optimal split points without full one-hot encoding." }, { "id": 24, "questionText": "Which parameter in LightGBM is used to handle categorical features?", "options": [ "categorical_feature", "cat_var", "cat_split", "categorical_index" ], "correctAnswerIndex": 0, "explanation": "The categorical_feature parameter specifies which columns are treated as categorical during training." }, { "id": 25, "questionText": "Scenario: LightGBM is using GPU for training. 
Which advantage does this offer?", "options": [ "Faster histogram construction and split finding", "Automatic feature engineering", "Better interpretability", "Improved regularization" ], "correctAnswerIndex": 0, "explanation": "GPU acceleration speeds up histogram creation and split computations, significantly reducing training time." } ] } { "title": "LightGBM Mastery: Part 2 (Q26–50)", "description": "Questions 26–50 exploring LightGBM tuning — sampling, regularization, parameter interactions, parallelization, and practical training strategies.", "questions": [ { "id": 26, "questionText": "What does the bagging_freq parameter control in LightGBM?", "options": [ "How frequently bagging is performed during training", "Number of features to drop per tree", "Learning rate schedule frequency", "Regularization update interval" ], "correctAnswerIndex": 0, "explanation": "bagging_freq determines after how many boosting iterations LightGBM performs row subsampling." }, { "id": 27, "questionText": "Which LightGBM optimization helps in handling large-scale datasets efficiently?", "options": [ "Histogram-based binning", "Deep tree expansion", "Dynamic pruning", "Recurrent boosting" ], "correctAnswerIndex": 0, "explanation": "LightGBM uses histogram-based binning to reduce computation and memory usage for large datasets." }, { "id": 28, "questionText": "Scenario: Dataset contains extreme class imbalance. Which parameter helps mitigate it?", "options": [ "scale_pos_weight", "bagging_fraction", "num_leaves", "min_child_samples" ], "correctAnswerIndex": 0, "explanation": "scale_pos_weight adjusts the relative weight of positive samples to handle imbalanced datasets effectively." 
}, { "id": 29, "questionText": "Which LightGBM setting should be increased to make the model less sensitive to noise?", "options": [ "min_child_samples", "num_leaves", "learning_rate", "max_depth" ], "correctAnswerIndex": 0, "explanation": "Increasing min_child_samples ensures leaves contain more data points, making the model more robust to noise." }, { "id": 30, "questionText": "What is the role of early_stopping_round in LightGBM training?", "options": [ "Stops training when validation loss does not improve after a certain number of rounds", "Reduces learning rate automatically", "Saves best iteration for retraining", "Increases number of leaves gradually" ], "correctAnswerIndex": 0, "explanation": "early_stopping_round halts training if performance on validation data stops improving." }, { "id": 31, "questionText": "Scenario: You observe that LightGBM trains very fast but underfits. What adjustment helps?", "options": [ "Increase num_leaves or n_estimators", "Decrease learning rate", "Reduce max_depth", "Reduce feature_fraction" ], "correctAnswerIndex": 0, "explanation": "Increasing num_leaves or n_estimators allows the model to capture more complexity and reduce underfitting." }, { "id": 32, "questionText": "What does LightGBM’s leaf-wise tree growth mean?", "options": [ "It splits the leaf with the highest loss reduction first", "It splits all leaves at the same level simultaneously", "It grows the tree symmetrically", "It uses fixed depth trees" ], "correctAnswerIndex": 0, "explanation": "Leaf-wise growth selects and splits the leaf that gives the greatest loss reduction, leading to faster convergence." 
}, { "id": 33, "questionText": "Which parameter combination most affects model complexity?", "options": [ "num_leaves and max_depth", "feature_fraction and bagging_fraction", "learning_rate and n_estimators", "lambda_l1 and lambda_l2" ], "correctAnswerIndex": 0, "explanation": "num_leaves and max_depth jointly control tree structure and hence the complexity of the model." }, { "id": 34, "questionText": "Scenario: LightGBM runs out of memory on a massive dataset. Which setting helps reduce memory usage?", "options": [ "Reduce max_bin", "Increase learning_rate", "Set boosting_type to dart", "Increase num_leaves" ], "correctAnswerIndex": 0, "explanation": "Reducing max_bin decreases the number of histogram bins, lowering memory requirements." }, { "id": 35, "questionText": "What does the parameter max_bin represent in LightGBM?", "options": [ "Maximum number of bins to bucket continuous features", "Maximum number of leaves per tree", "Maximum depth of trees", "Maximum iterations for convergence" ], "correctAnswerIndex": 0, "explanation": "max_bin determines how many discrete bins each feature will be divided into during histogram building." }, { "id": 36, "questionText": "Scenario: Model training takes too long. Which adjustment improves speed most effectively?", "options": [ "Reduce max_bin or use bagging_fraction < 1", "Increase num_leaves", "Reduce learning_rate only", "Increase regularization terms" ], "correctAnswerIndex": 0, "explanation": "Reducing max_bin makes histogram construction and split finding cheaper, while a bagging_fraction below 1 reduces the number of rows processed in each iteration; both speed up training." }, { "id": 37, "questionText": "What is the primary drawback of leaf-wise tree growth?", "options": [ "Higher risk of overfitting on small data", "Slower convergence", "Worse performance on large datasets", "Poor categorical handling" ], "correctAnswerIndex": 0, "explanation": "Leaf-wise growth can overfit on small datasets because it may produce very deep trees." 
}, { "id": 38, "questionText": "Which LightGBM parameter defines how many bins are created for each feature?", "options": [ "max_bin", "num_leaves", "feature_fraction", "max_depth" ], "correctAnswerIndex": 0, "explanation": "max_bin sets how finely continuous features are bucketed into discrete bins for histogram-based learning." }, { "id": 39, "questionText": "What type of regularization do lambda_l1 and lambda_l2 correspond to?", "options": [ "Lasso and Ridge regularization", "Elastic Net regularization", "Dropout regularization", "Tree pruning regularization" ], "correctAnswerIndex": 0, "explanation": "lambda_l1 and lambda_l2 implement Lasso (L1) and Ridge (L2) regularization respectively." }, { "id": 40, "questionText": "Scenario: You observe overfitting with high validation error. Which parameters help reduce it?", "options": [ "Increase min_child_samples, decrease num_leaves", "Increase learning_rate, increase num_leaves", "Reduce lambda_l2", "Increase feature_fraction" ], "correctAnswerIndex": 0, "explanation": "Increasing min_child_samples and reducing num_leaves simplify the model, reducing overfitting." }, { "id": 41, "questionText": "What is the role of monotone_constraints in LightGBM?", "options": [ "Ensure certain features have monotonic relationships with the target", "Enforce equal feature importance", "Reduce overfitting using L2 regularization", "Apply monotonic normalization to inputs" ], "correctAnswerIndex": 0, "explanation": "monotone_constraints force LightGBM to maintain a monotonic relationship for specific features." }, { "id": 42, "questionText": "Scenario: You want reproducible results from LightGBM training. Which parameter helps?", "options": [ "random_state", "seed", "random_seed", "Any of the above" ], "correctAnswerIndex": 3, "explanation": "seed, random_seed, and random_state are aliases for the same LightGBM parameter, so setting any one of them fixes the random seed and helps make training results reproducible." 
}, { "id": 43, "questionText": "Which LightGBM parameter defines the objective function?", "options": [ "objective", "metric", "boosting_type", "learning_rate" ], "correctAnswerIndex": 0, "explanation": "The objective parameter defines the loss function that LightGBM optimizes, e.g., 'binary', 'regression'." }, { "id": 44, "questionText": "Scenario: You are using LightGBM for multi-class classification. What should the objective be set to?", "options": [ "multiclass", "multiclassova", "binary", "regression" ], "correctAnswerIndex": 0, "explanation": "For multi-class classification, objective='multiclass' should be used with num_class specified." }, { "id": 45, "questionText": "What does feature_pre_filter in LightGBM control?", "options": [ "Whether features are pre-screened before training", "Feature normalization", "Automatic feature selection during training", "Dropout of low importance features" ], "correctAnswerIndex": 0, "explanation": "feature_pre_filter determines whether LightGBM, when constructing the Dataset, pre-filters features that cannot be split under the current min_data_in_leaf setting." }, { "id": 46, "questionText": "Scenario: You use categorical features in LightGBM. What advantage does native support provide?", "options": [ "Faster training and memory efficiency", "One-hot encoding automatically expands features", "Improved interpretability", "Model regularization" ], "correctAnswerIndex": 0, "explanation": "Native categorical handling avoids one-hot expansion, leading to faster and more efficient training." }, { "id": 47, "questionText": "Which LightGBM boosting type is best for highly imbalanced datasets?", "options": [ "goss", "gbdt", "dart", "rf" ], "correctAnswerIndex": 0, "explanation": "GOSS (Gradient-based One-Side Sampling) is efficient and works well for imbalanced datasets by focusing on large-gradient samples." }, { "id": 48, "questionText": "Scenario: LightGBM shows slightly worse accuracy than XGBoost. 
Which parameter tuning may help?", "options": [ "Reduce learning_rate and increase n_estimators", "Increase feature_fraction", "Reduce num_leaves", "Disable histogram optimization" ], "correctAnswerIndex": 0, "explanation": "Decreasing learning_rate while increasing n_estimators allows more refined learning and may improve accuracy." }, { "id": 49, "questionText": "Which LightGBM metric should you use for binary classification?", "options": [ "binary_logloss", "l2", "mae", "multi_logloss" ], "correctAnswerIndex": 0, "explanation": "binary_logloss measures the log loss for binary classification tasks." }, { "id": 50, "questionText": "Scenario: You want to save training time without losing much performance. Which parameters can be combined?", "options": [ "Use smaller max_bin, bagging_fraction, and feature_fraction", "Increase num_leaves and n_estimators", "Reduce learning_rate only", "Disable regularization terms" ], "correctAnswerIndex": 0, "explanation": "Reducing max_bin, bagging_fraction, and feature_fraction reduces training cost while maintaining accuracy." } ] } { "title": "LightGBM Mastery: Part 3 (Q51–75)", "description": "Intermediate to advanced questions (51–75) exploring LightGBM regularization, overfitting control, advanced parameters, and real-world optimization scenarios.", "questions": [ { "id": 51, "questionText": "What is the function of lambda_l1 in LightGBM?", "options": [ "Applies L1 regularization to leaf weights", "Limits number of leaves per tree", "Determines the learning rate", "Applies L2 regularization" ], "correctAnswerIndex": 0, "explanation": "lambda_l1 controls L1 regularization on leaf weights to reduce overfitting." 
}, { "id": 52, "questionText": "What is the function of lambda_l2 in LightGBM?", "options": [ "Applies L2 regularization to leaf weights", "Controls learning rate", "Reduces feature fraction", "Prunes shallow trees" ], "correctAnswerIndex": 0, "explanation": "lambda_l2 applies L2 regularization on leaf weights to smooth large values and improve generalization." }, { "id": 53, "questionText": "Scenario: You notice LightGBM overfits heavily. Which parameters could help?", "options": [ "Decrease num_leaves, increase min_data_in_leaf", "Increase num_leaves, lower min_data_in_leaf", "Increase learning rate", "Increase max_depth only" ], "correctAnswerIndex": 0, "explanation": "Reducing num_leaves and increasing min_data_in_leaf makes trees simpler and prevents overfitting." }, { "id": 54, "questionText": "What does 'min_split_gain' control in LightGBM?", "options": [ "Minimum gain required to perform a split", "Maximum gain per leaf", "Learning rate adjustment", "Subsample fraction" ], "correctAnswerIndex": 0, "explanation": "min_split_gain sets a threshold for information gain; splits below this are ignored." }, { "id": 55, "questionText": "Which LightGBM parameter controls row sampling per iteration?", "options": [ "bagging_fraction", "feature_fraction", "num_leaves", "max_bin" ], "correctAnswerIndex": 0, "explanation": "bagging_fraction randomly samples rows for each boosting round to reduce variance." }, { "id": 56, "questionText": "What does 'feature_fraction' control in LightGBM?", "options": [ "Fraction of features used per tree", "Number of bins per feature", "Maximum depth of tree", "Feature scaling" ], "correctAnswerIndex": 0, "explanation": "feature_fraction specifies the proportion of features used to build each tree." }, { "id": 57, "questionText": "Scenario: Increasing feature_fraction improves accuracy but lowers stability. 
Why?", "options": [ "More features increase model variance", "Feature_fraction reduces bias", "Learning rate decreases automatically", "Tree depth is fixed" ], "correctAnswerIndex": 0, "explanation": "Using more features per iteration reduces randomness and increases variance, which may reduce stability." }, { "id": 58, "questionText": "Which LightGBM parameter controls data sampling frequency?", "options": [ "bagging_freq", "num_iterations", "min_child_samples", "max_depth" ], "correctAnswerIndex": 0, "explanation": "bagging_freq controls how often (in iterations) row subsampling is performed." }, { "id": 59, "questionText": "Scenario: You set bagging_fraction=1.0 and feature_fraction=1.0. Effect?", "options": [ "No random sampling; all data and features used every iteration", "Strong regularization", "Improved generalization", "Subsampling increases variance" ], "correctAnswerIndex": 0, "explanation": "Setting both to 1.0 disables random sampling, using all features and samples every iteration." }, { "id": 60, "questionText": "What does 'max_bin' affect in LightGBM?", "options": [ "Precision of feature discretization", "Learning rate", "Tree depth", "Bagging rate" ], "correctAnswerIndex": 0, "explanation": "max_bin determines how many bins each continuous feature is bucketed into for histogram-based splitting." }, { "id": 61, "questionText": "Higher max_bin values typically lead to what?", "options": [ "More precise splits but slower training", "Faster training with less precision", "More regularization", "Smaller trees" ], "correctAnswerIndex": 0, "explanation": "Increasing max_bin gives more precise splits but increases memory and training time." }, { "id": 62, "questionText": "Scenario: Large dataset with limited RAM. 
What should you adjust?", "options": [ "Decrease max_bin and num_leaves", "Increase learning rate", "Disable histogram mode", "Increase max_depth" ], "correctAnswerIndex": 0, "explanation": "Reducing max_bin and num_leaves lowers memory footprint and speeds up training." }, { "id": 63, "questionText": "What is the function of 'min_data_in_leaf'?", "options": [ "Minimum number of samples required to form a leaf", "Maximum depth limit", "Learning rate controller", "Number of leaves in total" ], "correctAnswerIndex": 0, "explanation": "min_data_in_leaf ensures a minimum number of samples per leaf to avoid overfitting." }, { "id": 64, "questionText": "What happens if min_data_in_leaf is set too high?", "options": [ "Model underfits due to shallow trees", "Model overfits easily", "Learning rate decreases", "Training stops early" ], "correctAnswerIndex": 0, "explanation": "Too high min_data_in_leaf makes leaves large and reduces model complexity, causing underfitting." }, { "id": 65, "questionText": "What parameter limits the maximum tree depth in LightGBM?", "options": [ "max_depth", "num_leaves", "min_data_in_leaf", "feature_fraction" ], "correctAnswerIndex": 0, "explanation": "max_depth caps how deep each tree can grow." }, { "id": 66, "questionText": "Scenario: You set max_depth=-1. What happens?", "options": [ "Tree depth is unlimited and controlled by num_leaves instead", "Training fails", "Trees become shallow automatically", "Regularization is disabled" ], "correctAnswerIndex": 0, "explanation": "Setting max_depth=-1 removes explicit depth restriction; num_leaves indirectly limits complexity." 
}, { "id": 67, "questionText": "What is the effect of increasing num_iterations in LightGBM?", "options": [ "Model trains longer and may overfit if learning_rate is not reduced", "Model converges faster", "Less accurate model", "Shallower trees" ], "correctAnswerIndex": 0, "explanation": "More boosting iterations improve fit but can overfit unless compensated by lower learning rate." }, { "id": 68, "questionText": "Scenario: Reducing learning_rate but keeping num_iterations constant causes?", "options": [ "Underfitting, since model learns slower", "Overfitting", "Higher variance", "Deeper trees" ], "correctAnswerIndex": 0, "explanation": "Low learning rate with few iterations may lead to underfitting as the model learns too slowly." }, { "id": 69, "questionText": "What is the main benefit of histogram-based decision trees in LightGBM?", "options": [ "Faster training and lower memory usage", "More precise split thresholds", "Supports only small datasets", "Improves interpretability" ], "correctAnswerIndex": 0, "explanation": "Histogram-based methods speed up training by grouping continuous values into discrete bins." }, { "id": 70, "questionText": "Scenario: You increase max_bin significantly. What might happen?", "options": [ "Training slows down and may overfit", "Training speeds up", "Model ignores rare features", "Learning rate increases automatically" ], "correctAnswerIndex": 0, "explanation": "Higher max_bin allows finer splits but can increase overfitting and computation time." }, { "id": 71, "questionText": "What parameter controls the number of boosting rounds?", "options": [ "num_iterations", "max_depth", "feature_fraction", "min_data_in_leaf" ], "correctAnswerIndex": 0, "explanation": "num_iterations defines the total number of boosting rounds (trees) to train." 
}, { "id": 72, "questionText": "Scenario: Decreasing num_iterations while keeping learning_rate fixed will usually?", "options": [ "Reduce model capacity and may underfit", "Cause overfitting", "Speed up convergence with higher accuracy", "Have no effect" ], "correctAnswerIndex": 0, "explanation": "Fewer iterations reduce model capacity, leading to underfitting if learning_rate is unchanged." }, { "id": 73, "questionText": "What is the benefit of early_stopping_rounds in LightGBM?", "options": [ "Automatically halts training when validation loss stops improving", "Reduces learning rate dynamically", "Increases tree depth automatically", "Samples more features" ], "correctAnswerIndex": 0, "explanation": "early_stopping_rounds prevents overfitting by stopping when performance stops improving on validation data." }, { "id": 74, "questionText": "Which parameter combination best prevents overfitting?", "options": [ "Lower num_leaves, lower learning_rate, higher min_data_in_leaf", "Higher num_leaves, higher learning_rate", "Increase max_depth only", "Set bagging_fraction=1" ], "correctAnswerIndex": 0, "explanation": "Simpler trees, smaller learning rate, and more data per leaf enhance generalization." }, { "id": 75, "questionText": "Scenario: Large data, strong overfitting, and high variance. What to do?", "options": [ "Lower num_leaves, use bagging and feature_fraction < 1", "Increase tree depth", "Raise learning rate", "Disable regularization" ], "correctAnswerIndex": 0, "explanation": "Using smaller trees and random sampling helps reduce overfitting and variance." 
} ] } { "title": "LightGBM Mastery: Part 4 (Q76–100)", "description": "Advanced and expert-level questions (76–100) exploring LightGBM’s GPU acceleration, categorical feature encoding, distributed learning, interpretability, and fine-tuning strategies.", "questions": [ { "id": 76, "questionText": "What is one key advantage of LightGBM over XGBoost?", "options": [ "Uses leaf-wise tree growth for faster convergence", "Uses level-wise tree growth for stability", "Cannot handle large datasets", "Lacks regularization" ], "correctAnswerIndex": 0, "explanation": "LightGBM grows trees leaf-wise with depth constraints, achieving faster convergence and lower loss." }, { "id": 77, "questionText": "What happens if num_leaves is much larger than 2^max_depth?", "options": [ "Overfitting increases due to overly complex trees", "Model underfits severely", "No effect on model performance", "Training halts automatically" ], "correctAnswerIndex": 0, "explanation": "Too many leaves compared to max_depth allow excessive branching, causing overfitting." }, { "id": 78, "questionText": "Scenario: You enable GPU support in LightGBM. What primary benefit is expected?", "options": [ "Faster histogram construction and split finding", "Improved accuracy", "Reduced model complexity", "Automatic regularization" ], "correctAnswerIndex": 0, "explanation": "GPU acceleration speeds up histogram building and split calculations, improving training speed." }, { "id": 79, "questionText": "Which LightGBM parameter enables GPU training?", "options": [ "device_type='gpu'", "gpu_enable=True", "use_gpu=1", "boosting_type='gpu'" ], "correctAnswerIndex": 0, "explanation": "Setting device_type='gpu' tells LightGBM to use GPU resources for training." 
}, { "id": 80, "questionText": "What is the impact of 'boosting_type' parameter?", "options": [ "Selects the boosting algorithm (gbdt, dart, goss)", "Controls feature sampling", "Defines tree depth", "Applies learning rate decay" ], "correctAnswerIndex": 0, "explanation": "boosting_type specifies the boosting algorithm variant such as 'gbdt', 'dart', or 'goss'." }, { "id": 81, "questionText": "What is DART in LightGBM?", "options": [ "Dropouts meet Multiple Additive Regression Trees", "Distributed Automatic Regression Tree", "Dynamic Adaptive Regularized Trees", "Data Adaptive Reduction Technique" ], "correctAnswerIndex": 0, "explanation": "DART is 'Dropouts meet Multiple Additive Regression Trees', introducing dropout into boosting to reduce overfitting." }, { "id": 82, "questionText": "Scenario: Using boosting_type='goss'. What does GOSS stand for?", "options": [ "Gradient-based One-Side Sampling", "Global Overfitting Sample Selector", "Generalized Optimization for Split Search", "Gradient Optimization Sampling Strategy" ], "correctAnswerIndex": 0, "explanation": "GOSS stands for Gradient-based One-Side Sampling, reducing data processed per iteration for speed." }, { "id": 83, "questionText": "What does GOSS primarily do?", "options": [ "Keeps large-gradient samples and randomly drops small-gradient ones", "Drops large-gradient samples", "Uses all samples equally", "Increases number of trees" ], "correctAnswerIndex": 0, "explanation": "GOSS keeps high-gradient samples for training, reducing computation while preserving accuracy." }, { "id": 84, "questionText": "Scenario: Dataset has many categorical variables. What should you do?", "options": [ "Use LightGBM's built-in categorical feature support", "One-hot encode all features manually", "Convert to text data", "Ignore categorical columns" ], "correctAnswerIndex": 0, "explanation": "LightGBM natively supports categorical features through optimal split encoding without full one-hot expansion." 
}, { "id": 85, "questionText": "How does LightGBM handle categorical features internally?", "options": [ "Sorts categories by average target and finds best split", "Performs label encoding only", "Uses frequency encoding", "Applies hash bucketing" ], "correctAnswerIndex": 0, "explanation": "LightGBM sorts the categories by a per-category training statistic (sum_gradient / sum_hessian, which plays the role of an average target) and then finds the best split over the sorted order." }, { "id": 86, "questionText": "Scenario: Training time is long on large data with many features. What can help?", "options": [ "Reduce feature_fraction and bagging_fraction", "Increase num_leaves", "Disable histogram mode", "Increase max_bin drastically" ], "correctAnswerIndex": 0, "explanation": "Reducing feature_fraction and bagging_fraction speeds up training by using subsets of features and samples." }, { "id": 87, "questionText": "What is the role of 'max_cat_threshold'?", "options": [ "Controls maximum thresholds for categorical splits", "Limits maximum tree depth", "Sets number of categories allowed", "Defines learning rate schedule" ], "correctAnswerIndex": 0, "explanation": "max_cat_threshold limits how many thresholds LightGBM evaluates for categorical splits." }, { "id": 88, "questionText": "Scenario: Distributed LightGBM training is producing inconsistent results. Likely reason?", "options": [ "Non-deterministic data shuffling or parameter differences across nodes", "Too high learning rate", "Disabled GPU support", "Overfitting due to small num_leaves" ], "correctAnswerIndex": 0, "explanation": "Different random seeds or node configurations in distributed mode can cause inconsistency." }, { "id": 89, "questionText": "What helps ensure reproducible LightGBM results?", "options": [ "Set deterministic=True and fix random_seed", "Increase bagging_fraction", "Enable GPU mode", "Reduce learning rate" ], "correctAnswerIndex": 0, "explanation": "Setting deterministic=True and fixing random_seed ensures consistent results across runs." 
}, { "id": 90, "questionText": "Which LightGBM feature allows parallel learning across machines?", "options": [ "Distributed training mode", "Bagging", "GPU histograms", "Early stopping" ], "correctAnswerIndex": 0, "explanation": "Distributed mode enables training across multiple machines using the data-parallel, feature-parallel, or voting-parallel tree learners." }, { "id": 91, "questionText": "What is the key difference between DART and standard GBDT?", "options": [ "DART randomly drops trees during training to prevent overfitting", "DART doubles learning rate dynamically", "DART uses fewer features per tree", "DART cannot perform regression tasks" ], "correctAnswerIndex": 0, "explanation": "DART introduces dropout on trees, improving regularization and generalization." }, { "id": 92, "questionText": "Scenario: Validation accuracy fluctuates heavily between iterations. Likely cause?", "options": [ "Learning rate too high or bagging too aggressive", "Too many trees", "Too few bins", "High lambda_l2" ], "correctAnswerIndex": 0, "explanation": "High learning rate or aggressive subsampling can cause instability in validation metrics." }, { "id": 93, "questionText": "What does the 'linear_tree' parameter enable?", "options": [ "Adds linear models to each leaf for hybrid boosting", "Switches boosting type", "Performs polynomial regression", "Forces shallow trees" ], "correctAnswerIndex": 0, "explanation": "linear_tree enables a linear model within each leaf, combining tree and linear learning." }, { "id": 94, "questionText": "Scenario: Using linear_tree improved performance slightly but increased training time. Why?", "options": [ "Linear models per leaf require additional optimization", "Learning rate reduced automatically", "Tree structure became shallower", "Fewer bins created per feature" ], "correctAnswerIndex": 0, "explanation": "Each leaf fits a small linear model, increasing training computation but often improving accuracy."
}, { "id": 95, "questionText": "Which LightGBM setting improves memory efficiency on large data?", "options": [ "Use histogram pool sharing and smaller max_bin", "Increase max_depth", "Enable linear_tree", "Disable bagging" ], "correctAnswerIndex": 0, "explanation": "Reducing max_bin and using histogram sharing significantly lower memory usage." }, { "id": 96, "questionText": "Scenario: Feature importance shows unexpected zeros for numeric features. Why?", "options": [ "Feature was rarely used due to high correlation or low information gain", "Model error", "Bug in LightGBM", "Feature_fraction=1.0" ], "correctAnswerIndex": 0, "explanation": "Highly correlated or uninformative features may never be chosen for splits, yielding zero importance." }, { "id": 97, "questionText": "What is the purpose of 'monotone_constraints'?", "options": [ "Forces model predictions to follow specified monotonic relationships with features", "Restricts tree depth", "Balances data classes", "Disables early stopping" ], "correctAnswerIndex": 0, "explanation": "monotone_constraints ensure predictions move consistently up or down with certain features." }, { "id": 98, "questionText": "Scenario: You set monotone_constraints incorrectly. Possible issue?", "options": [ "Model accuracy drops or fails to converge", "Training halts immediately", "All features are ignored", "Learning rate resets" ], "correctAnswerIndex": 0, "explanation": "Wrong monotonic constraints can make optimization infeasible, harming accuracy or convergence." }, { "id": 99, "questionText": "What metric would you monitor for binary classification?", "options": [ "binary_logloss or AUC", "mean_squared_error", "poisson", "quantile" ], "correctAnswerIndex": 0, "explanation": "For binary tasks, LightGBM supports metrics like binary_logloss and AUC for evaluation." }, { "id": 100, "questionText": "Scenario: After tuning, training accuracy improves but test accuracy drops. What happened?", "options": [ "Overfitting", "Underfitting", "Learning rate too small", "Too many missing values" ], "correctAnswerIndex": 0, "explanation": "Higher training accuracy with lower test performance indicates overfitting." } ] }