{ "title": "Deep Reinforcement Learning Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions to test and deepen your understanding of Deep Reinforcement Learning, from basic concepts to advanced topics like Deep Q-Networks, Policy Gradients, and Actor-Critic methods.", "questions": [ { "id": 1, "questionText": "What is the main difference between standard Q-Learning and Deep Q-Learning?", "options": [ "Q-Learning ignores rewards", "Deep Q-Learning uses a neural network to approximate Q-values", "Q-Learning uses continuous actions", "Deep Q-Learning requires supervised labels" ], "correctAnswerIndex": 1, "explanation": "Deep Q-Learning replaces the Q-table with a neural network to handle large or continuous state spaces." }, { "id": 2, "questionText": "Which problem does function approximation in Deep Reinforcement Learning solve?", "options": [ "Exploration vs. exploitation", "Handling large or continuous state spaces", "Reward shaping", "Reducing discount factor" ], "correctAnswerIndex": 1, "explanation": "Function approximation allows the agent to generalize Q-values across many states instead of storing a table." }, { "id": 3, "questionText": "In Deep Q-Networks (DQN), what is the purpose of the target network?", "options": [ "Generate random actions", "Provide rewards", "Stabilize learning by providing fixed Q-value targets periodically", "Replace policy network completely" ], "correctAnswerIndex": 2, "explanation": "The target network reduces oscillations by keeping Q-value targets fixed for several updates before copying from the main network." }, { "id": 4, "questionText": "What is experience replay in DRL?", "options": [ "Resetting the environment", "Storing past experiences and sampling randomly for training", "Recording rewards only", "Using supervised labels" ], "correctAnswerIndex": 1, "explanation": "Experience replay breaks correlation between sequential data and improves learning stability by training on randomly sampled past experiences." }, { "id": 5, "questionText": "Which activation function is commonly used in DRL networks?", "options": [ "Linear only", "ReLU", "Tanh only", "Sigmoid only" ], "correctAnswerIndex": 1, "explanation": "ReLU is commonly used due to its efficiency and ability to reduce vanishing gradient problems." }, { "id": 6, "questionText": "In DRL, what is the role of the discount factor γ?", "options": [ "Control neural network learning rate", "Weigh future rewards relative to immediate rewards", "Scale input features", "Select actions randomly" ], "correctAnswerIndex": 1, "explanation": "The discount factor balances the importance of immediate versus future rewards in value estimation." }, { "id": 7, "questionText": "Why is Q-learning considered off-policy?", "options": [ "It uses supervised labels", "It updates values using the best next action, not necessarily the action taken", "It follows a fixed policy only", "It ignores rewards" ], "correctAnswerIndex": 1, "explanation": "Off-policy learning uses the optimal action for updating Q-values regardless of the agent’s actual behavior policy." }, { "id": 8, "questionText": "Which problem does target network in DQN help to mitigate?", "options": [ "Exploration failure", "Instability due to moving Q-value targets", "Sparse rewards", "Reward hacking" ], "correctAnswerIndex": 1, "explanation": "Fixing Q-value targets for several steps reduces oscillations and divergence in neural network training." 
}, { "id": 9, "questionText": "In DRL, why is experience replay preferred over online updates?", "options": [ "Requires supervised data", "Reduces correlation between consecutive samples", "Only works with deterministic environments", "Avoids using discount factor" ], "correctAnswerIndex": 1, "explanation": "Sampling random experiences helps the network learn more effectively from diverse states and actions." }, { "id": 10, "questionText": "What is a primary advantage of Deep Q-Networks over tabular Q-Learning?", "options": [ "Eliminates exploration", "Removes the need for discount factor", "Can handle high-dimensional or continuous state spaces", "Requires fewer rewards" ], "correctAnswerIndex": 2, "explanation": "DQN can generalize across large state spaces using neural networks instead of storing Q-values in a table." }, { "id": 11, "questionText": "Which of the following is a common DRL benchmark environment?", "options": [ "ImageNet", "MNIST", "OpenAI Gym", "COCO" ], "correctAnswerIndex": 2, "explanation": "OpenAI Gym provides standardized environments for testing DRL algorithms." }, { "id": 12, "questionText": "Why is gradient clipping used in DRL?", "options": [ "Reduce discount factor", "Increase rewards artificially", "Control exploration rate", "Prevent exploding gradients during neural network training" ], "correctAnswerIndex": 3, "explanation": "Gradient clipping limits extreme weight updates, improving stability." }, { "id": 13, "questionText": "Which is true about the replay buffer size in DQN?", "options": [ "Size does not affect learning", "Larger buffers store more diverse experiences but use more memory", "Buffers store only rewards", "Small buffers always converge faster" ], "correctAnswerIndex": 1, "explanation": "Larger buffers provide better sample diversity, improving learning stability." }, { "id": 14, "questionText": "In DRL, what is the main challenge with continuous action spaces?", "options": [ "Exploration is unnecessary", "Rewards cannot be used", "Q-Learning requires discrete actions; approximation methods like DDPG are needed", "Discount factor cannot be applied" ], "correctAnswerIndex": 2, "explanation": "Discrete action Q-learning cannot directly handle continuous actions; actor-critic methods or policy gradients are used." }, { "id": 15, "questionText": "What is the purpose of a policy network in policy gradient methods?", "options": [ "Estimates Q-values", "Directly outputs action probabilities for a given state", "Stores experiences", "Generates rewards" ], "correctAnswerIndex": 1, "explanation": "Policy networks map states to action probabilities, allowing stochastic or deterministic policies." }, { "id": 16, "questionText": "Which DRL method is considered on-policy?", "options": [ "Double Q-Learning", "Experience Replay only", "Deep Q-Network (DQN)", "REINFORCE" ], "correctAnswerIndex": 3, "explanation": "REINFORCE updates the policy based on actions actually taken by the agent, making it on-policy." }, { "id": 17, "questionText": "Which type of neural network is commonly used in DRL for image inputs?", "options": [ "Recurrent Neural Networks (RNNs) only", "Fully connected only", "Autoencoders only", "Convolutional Neural Networks (CNNs)" ], "correctAnswerIndex": 3, "explanation": "CNNs extract spatial features from images for DRL agents like in Atari games." 
}, { "id": 18, "questionText": "In DRL, what is a major issue with high variance in policy gradient methods?", "options": [ "Q-values are ignored", "Exploration becomes deterministic", "Slow and unstable learning", "Rewards become negative" ], "correctAnswerIndex": 2, "explanation": "High variance can make gradient updates noisy, slowing convergence." }, { "id": 19, "questionText": "Which technique reduces variance in policy gradient updates?", "options": [ "Use a baseline or advantage function", "Use greedy policy", "Increase discount factor", "Ignore rewards" ], "correctAnswerIndex": 0, "explanation": "Subtracting a baseline (like state value) reduces variance while keeping the estimate unbiased." }, { "id": 20, "questionText": "What is the main advantage of Actor-Critic methods over DQN?", "options": [ "Eliminates exploration", "Requires tabular Q-table", "Can handle continuous actions and reduce variance with a value estimator", "Removes the need for rewards" ], "correctAnswerIndex": 2, "explanation": "Actor-Critic combines policy learning (actor) with value estimation (critic) for better performance, especially in continuous domains." }, { "id": 21, "questionText": "Which DRL algorithm is suitable for continuous action control?", "options": [ "DQN", "SARSA", "Deep Deterministic Policy Gradient (DDPG)", "Monte Carlo only" ], "correctAnswerIndex": 2, "explanation": "DDPG can output continuous actions using an actor network and learn value using a critic network." }, { "id": 22, "questionText": "Why are target networks important in DRL?", "options": [ "Store experience replay", "Control exploration", "Generate rewards", "Stabilize training by reducing oscillations in Q-value targets" ], "correctAnswerIndex": 3, "explanation": "Without a target network, the moving Q-value targets cause instability during neural network updates." }, { "id": 23, "questionText": "What is a common solution for partially observable environments in DRL?", "options": [ "Use Recurrent Neural Networks (RNNs) to remember past states", "Use tabular Q-Learning", "Increase discount factor", "Ignore history" ], "correctAnswerIndex": 0, "explanation": "RNNs allow the agent to maintain an internal state, improving decisions in partially observable settings." }, { "id": 24, "questionText": "Which method combines policy gradients and value estimation for stability?", "options": [ "Monte Carlo only", "SARSA only", "Advantage Actor-Critic (A2C)", "DQN only" ], "correctAnswerIndex": 2, "explanation": "A2C uses a critic to estimate value and an actor to update policy, reducing variance and improving learning." }, { "id": 25, "questionText": "In DRL, what is the purpose of epsilon-greedy policy?", "options": [ "Balance exploration and exploitation", "Reduce network size", "Ignore rewards", "Stabilize gradients" ], "correctAnswerIndex": 0, "explanation": "Epsilon-greedy chooses random actions with probability ε to explore the environment while mostly exploiting the best-known action." }, { "id": 26, "questionText": "Which optimization algorithm is commonly used to train DRL networks?", "options": [ "SGD only", "Adam", "None", "RMSProp only" ], "correctAnswerIndex": 1, "explanation": "Adam combines momentum and adaptive learning rates, making it effective for DRL training." 
}, { "id": 27, "questionText": "What is reward shaping in DRL?", "options": [ "Changing discount factor", "Modifying the reward signal to provide intermediate feedback", "Removing rewards", "Randomizing actions" ], "correctAnswerIndex": 1, "explanation": "Reward shaping provides more frequent feedback to accelerate learning while keeping the optimal policy unchanged." }, { "id": 28, "questionText": "Why is clipping rewards sometimes used in DRL?", "options": [ "Increase exploration", "Reduce network size", "Prevent large gradients and stabilize training", "Ignore rewards" ], "correctAnswerIndex": 2, "explanation": "Clipping avoids extremely large reward signals that can destabilize learning." }, { "id": 29, "questionText": "Which DRL method directly models a stochastic policy?", "options": [ "Policy Gradient (REINFORCE)", "DQN", "SARSA", "Double Q-Learning" ], "correctAnswerIndex": 0, "explanation": "Policy gradients learn a probability distribution over actions, allowing stochastic action selection." }, { "id": 30, "questionText": "In DRL, why is it important to normalize inputs?", "options": [ "Q-values become zero", "Rewards are ignored", "Exploration is unnecessary", "Neural network training is more stable and faster" ], "correctAnswerIndex": 3, "explanation": "Normalized inputs prevent large-scale differences that can hinder learning and slow convergence." }, { "id": 31, "questionText": "What is the key idea behind Double DQN?", "options": [ "Use two policies to explore the environment", "Combine policy gradient with Q-learning", "Update Q-values twice per step", "Use one network for action selection and another for evaluation to reduce overestimation" ], "correctAnswerIndex": 3, "explanation": "Double DQN separates action selection and Q-value evaluation to mitigate overestimation bias seen in standard DQN." }, { "id": 32, "questionText": "What is the purpose of prioritized experience replay?", "options": [ "Sample important experiences more frequently to improve learning efficiency", "Ignore old experiences", "Store only positive rewards", "Replay experiences in sequential order" ], "correctAnswerIndex": 0, "explanation": "Prioritized replay focuses learning on transitions with higher temporal-difference errors, improving convergence speed." }, { "id": 33, "questionText": "Which DRL algorithm is suitable for continuous control tasks?", "options": [ "DQN", "Deep Deterministic Policy Gradient (DDPG)", "Q-Learning", "SARSA" ], "correctAnswerIndex": 1, "explanation": "DDPG can handle continuous action spaces using an actor network to output continuous actions and a critic network to estimate values." }, { "id": 34, "questionText": "In Actor-Critic methods, what is the role of the critic?", "options": [ "Modify rewards", "Estimate the value function to guide the actor", "Store replay memory", "Select actions randomly" ], "correctAnswerIndex": 1, "explanation": "The critic evaluates the current policy by estimating the expected return, providing feedback to the actor for policy improvement." }, { "id": 35, "questionText": "Which advantage does A3C (Asynchronous Advantage Actor-Critic) provide over standard Actor-Critic?", "options": [ "Removes the need for value estimation", "Eliminates exploration", "Uses only one agent to reduce computation", "Parallel training with multiple agents to stabilize learning" ], "correctAnswerIndex": 3, "explanation": "A3C uses multiple asynchronous agents exploring in parallel, which stabilizes learning and improves convergence speed." 
}, { "id": 36, "questionText": "What is the main challenge of high-dimensional state spaces in DRL?", "options": [ "Rewards become negative", "Learning rate α becomes zero", "Discount factor is ignored", "Curse of dimensionality increases sample complexity" ], "correctAnswerIndex": 3, "explanation": "High-dimensional inputs require more data to learn effective policies and can slow convergence." }, { "id": 37, "questionText": "Which technique helps DRL agents learn from visual input effectively?", "options": [ "RNNs only", "Convolutional Neural Networks (CNNs)", "Decision trees", "Fully connected networks only" ], "correctAnswerIndex": 1, "explanation": "CNNs extract spatial features from images, enabling DRL agents to handle complex visual environments." }, { "id": 38, "questionText": "In DDPG, why is it necessary to add noise to actions during training?", "options": [ "Promote exploration in continuous action spaces", "Stabilize the target network", "Reduce rewards", "Increase discount factor" ], "correctAnswerIndex": 0, "explanation": "Exploration is crucial in continuous action spaces; adding noise ensures the agent explores various actions." }, { "id": 39, "questionText": "What is the purpose of advantage function in A2C or A3C?", "options": [ "Reduce variance in policy gradient updates", "Ignore state values", "Store experiences", "Increase rewards" ], "correctAnswerIndex": 0, "explanation": "Advantage function measures how much better an action is compared to the expected value, reducing variance in updates." }, { "id": 40, "questionText": "Which of the following is a major limitation of vanilla policy gradients?", "options": [ "Cannot handle discrete actions", "Requires tabular Q-table", "High variance in gradient estimates", "Ignores rewards" ], "correctAnswerIndex": 2, "explanation": "Vanilla policy gradients have high variance, making learning slow and unstable." }, { "id": 41, "questionText": "Why is normalization of input features important in DRL?", "options": [ "Increases rewards artificially", "Stabilizes neural network training and improves convergence", "Reduces exploration", "Removes discount factor" ], "correctAnswerIndex": 1, "explanation": "Normalization prevents large-scale differences that could destabilize learning and slow down convergence." }, { "id": 42, "questionText": "In DRL, what is the role of target smoothing in DDPG?", "options": [ "Prevent oscillations by slowly updating target networks", "Ignore discount factor", "Randomize actions", "Generate rewards" ], "correctAnswerIndex": 0, "explanation": "Soft updates of the target network improve training stability by avoiding large sudden changes in Q-values." }, { "id": 43, "questionText": "What does the term 'on-policy' mean in DRL?", "options": [ "Agent stores experiences only", "Agent ignores rewards", "Agent uses a separate policy for evaluation", "Agent updates policy using actions it actually takes" ], "correctAnswerIndex": 3, "explanation": "On-policy methods learn the value of the policy being executed, unlike off-policy methods which can learn from other policies." }, { "id": 44, "questionText": "What does 'off-policy' learning in DRL allow?", "options": [ "Ignoring rewards", "Reducing discount factor to zero", "Only learning from current policy", "Learning optimal policy using experiences from a different behavior policy" ], "correctAnswerIndex": 3, "explanation": "Off-policy learning allows using past experiences or exploratory actions to learn the optimal policy." 
}, { "id": 45, "questionText": "Which method is used to reduce correlation between consecutive samples in DRL?", "options": [ "Target networks only", "Policy gradient", "Greedy policy", "Experience replay" ], "correctAnswerIndex": 3, "explanation": "Experience replay randomly samples past experiences, breaking temporal correlations and improving learning stability." }, { "id": 46, "questionText": "Which DRL algorithm is suitable for environments with discrete action spaces?", "options": [ "DDPG", "Policy Gradient with continuous actor", "SARSA only", "Deep Q-Network (DQN)" ], "correctAnswerIndex": 3, "explanation": "DQN works well in discrete action spaces by estimating Q-values for all possible actions." }, { "id": 47, "questionText": "Why is reward clipping sometimes applied in DRL?", "options": [ "Normalize inputs", "Prevent very large rewards from destabilizing training", "Increase exploration", "Reduce discount factor" ], "correctAnswerIndex": 1, "explanation": "Clipping rewards prevents extreme updates in the network that could destabilize learning." }, { "id": 48, "questionText": "What is the main advantage of using Actor-Critic over pure policy gradients?", "options": [ "Removes discount factor", "No neural network required", "Eliminates need for exploration", "Reduced variance and better sample efficiency" ], "correctAnswerIndex": 3, "explanation": "The critic estimates value function to guide the actor, reducing variance compared to vanilla policy gradient." }, { "id": 49, "questionText": "In DRL, why is gradient clipping applied?", "options": [ "Increase discount factor", "Prevent exploding gradients and stabilize learning", "Store experiences", "Normalize inputs" ], "correctAnswerIndex": 1, "explanation": "Clipping gradient magnitudes ensures neural network weights do not change abruptly, preventing instability." }, { "id": 50, "questionText": "Which DRL algorithm can handle both discrete and continuous action spaces with separate actor and critic networks?", "options": [ "SARSA only", "Actor-Critic / DDPG", "REINFORCE only", "DQN only" ], "correctAnswerIndex": 1, "explanation": "Actor-Critic methods and DDPG separate policy and value networks, allowing application in both discrete and continuous domains." }, { "id": 51, "questionText": "What is the main idea behind Advantage Actor-Critic (A2C)?", "options": [ "Store experiences for replay", "Ignore policy updates", "Only use the critic for evaluation", "Use the advantage function to reduce variance in policy updates" ], "correctAnswerIndex": 3, "explanation": "Advantage function improves learning stability by comparing action value against expected value for the state." }, { "id": 52, "questionText": "In DRL, what is a major issue with partial observability?", "options": [ "Discount factor cannot be applied", "Q-values are ignored", "Agent does not have full knowledge of the environment, making decision-making harder", "Rewards become deterministic" ], "correctAnswerIndex": 2, "explanation": "Partial observability requires the agent to infer hidden state information, often handled with RNNs." }, { "id": 53, "questionText": "Which technique improves exploration in continuous action DRL algorithms?", "options": [ "Clipping rewards", "Use deterministic greedy policy only", "Add noise (e.g., Ornstein-Uhlenbeck process in DDPG) to actor outputs", "Reducing discount factor" ], "correctAnswerIndex": 2, "explanation": "Adding noise ensures the agent explores diverse actions in continuous spaces." 
}, { "id": 54, "questionText": "What is the purpose of soft updates in target networks?", "options": [ "Increase exploration", "Smoothly update target network parameters to improve stability", "Ignore experience replay", "Clip rewards" ], "correctAnswerIndex": 1, "explanation": "Soft updates prevent large jumps in Q-value targets, stabilizing training." }, { "id": 55, "questionText": "Which DRL algorithm is particularly suitable for large discrete action spaces?", "options": [ "DDPG", "Dueling DQN", "SARSA only", "Policy Gradient only" ], "correctAnswerIndex": 1, "explanation": "Dueling DQN separates state-value and advantage function, allowing efficient learning in large discrete action spaces." }, { "id": 56, "questionText": "Why is advantage function useful in policy gradient methods?", "options": [ "Removes need for rewards", "Eliminates discount factor", "Reduces variance without introducing bias", "Stores experiences" ], "correctAnswerIndex": 2, "explanation": "By comparing action value to baseline, variance in gradient estimates decreases, improving stability." }, { "id": 57, "questionText": "In DRL, what is entropy regularization?", "options": [ "Reduce rewards", "Encourage exploration by adding entropy of the policy to the loss function", "Clips gradients", "Store experience replay" ], "correctAnswerIndex": 1, "explanation": "Entropy regularization prevents premature convergence to deterministic policies, encouraging exploration." }, { "id": 58, "questionText": "Which neural network is used to handle sequences in partially observable DRL tasks?", "options": [ "Decision trees", "Fully connected networks", "Recurrent Neural Networks (RNNs)", "CNNs only" ], "correctAnswerIndex": 2, "explanation": "RNNs maintain hidden states over time, allowing the agent to infer information from past observations." }, { "id": 59, "questionText": "Why is target network in DQN updated periodically?", "options": [ "Clip gradients", "Reduce oscillations and stabilize learning", "Increase rewards", "Reduce exploration" ], "correctAnswerIndex": 1, "explanation": "Periodic updates provide fixed targets for several steps, preventing divergence." }, { "id": 60, "questionText": "What is the main difference between DDPG and DQN?", "options": [ "DQN uses actor-critic; DDPG does not", "DDPG handles continuous actions; DQN handles discrete actions", "DQN is on-policy", "DDPG requires tabular Q-table" ], "correctAnswerIndex": 1, "explanation": "DDPG uses actor-critic for continuous actions, while DQN uses Q-value approximations for discrete actions." }, { "id": 61, "questionText": "What is the role of the critic in Actor-Critic methods?", "options": [ "Estimate value function to evaluate actions", "Normalize inputs", "Select random actions", "Clip rewards" ], "correctAnswerIndex": 0, "explanation": "The critic evaluates the policy by providing feedback on the quality of actions, guiding the actor." }, { "id": 62, "questionText": "Which DRL method is designed for multi-agent asynchronous training?", "options": [ "DQN", "SARSA", "DDPG", "A3C" ], "correctAnswerIndex": 3, "explanation": "A3C uses multiple agents training in parallel, improving efficiency and stability." }, { "id": 63, "questionText": "Which approach addresses overestimation in Q-values in DRL?", "options": [ "Double DQN", "Actor-Critic", "Policy gradient", "DQN only" ], "correctAnswerIndex": 0, "explanation": "Double DQN separates selection and evaluation, reducing overestimation bias in Q-learning." 
}, { "id": 64, "questionText": "Which DRL algorithm uses deterministic policy for continuous control?", "options": [ "DQN", "A2C", "DDPG", "REINFORCE" ], "correctAnswerIndex": 2, "explanation": "DDPG outputs deterministic actions from the actor network, suitable for continuous action environments." }, { "id": 65, "questionText": "Why is reward shaping useful in DRL?", "options": [ "Eliminates exploration", "Removes discount factor", "Provides intermediate rewards to accelerate learning", "Stores experiences" ], "correctAnswerIndex": 2, "explanation": "Shaping rewards gives the agent feedback on progress towards goals, improving convergence speed." }, { "id": 66, "questionText": "Which technique reduces variance in policy gradient methods?", "options": [ "Using advantage function or baseline", "Increasing learning rate", "Reducing discount factor", "Clipping rewards" ], "correctAnswerIndex": 0, "explanation": "Subtracting a baseline from the return reduces variance while keeping gradient estimates unbiased." }, { "id": 67, "questionText": "In DRL, why is exploration important?", "options": [ "Ignore rewards", "Store experiences", "Ensure agent discovers optimal actions rather than exploiting suboptimal known actions", "Reduce discount factor" ], "correctAnswerIndex": 2, "explanation": "Exploration allows the agent to learn about the environment and avoid getting stuck in local optima." }, { "id": 68, "questionText": "Which problem does partial observability introduce in DRL?", "options": [ "Learning rate becomes zero", "Rewards become deterministic", "Agent cannot fully observe the environment state, making decision-making harder", "Discount factor is ignored" ], "correctAnswerIndex": 2, "explanation": "Partial observability requires the agent to maintain internal memory or inference to act effectively." }, { "id": 69, "questionText": "Which DRL algorithm is on-policy?", "options": [ "DQN", "A2C", "Double DQN", "DDPG" ], "correctAnswerIndex": 1, "explanation": "A2C updates the policy based on actions actually taken, making it on-policy." }, { "id": 70, "questionText": "Why is entropy regularization used in policy gradient DRL?", "options": [ "Reduce rewards", "Encourage exploration by preventing premature convergence to deterministic policies", "Clip gradients", "Normalize inputs" ], "correctAnswerIndex": 1, "explanation": "Entropy regularization adds a term to the loss to favor higher-entropy (more exploratory) policies." }, { "id": 71, "questionText": "A robot using DDPG in a continuous action space keeps colliding with obstacles. What is the best approach?", "options": [ "Reduce discount factor to zero", "Use greedy deterministic policy only", "Modify the reward function to penalize collisions heavily", "Ignore collisions and continue training" ], "correctAnswerIndex": 2, "explanation": "Reward shaping helps the agent learn safer actions while maintaining exploration." }, { "id": 72, "questionText": "A DRL agent trained with DQN in a stochastic environment overestimates Q-values. What modification can help?", "options": [ "Use Double DQN to separate action selection and evaluation", "Use on-policy updates only", "Ignore rewards", "Increase learning rate drastically" ], "correctAnswerIndex": 0, "explanation": "Double DQN mitigates overestimation by using separate networks for selection and evaluation." }, { "id": 73, "questionText": "During training, a DRL agent’s policy oscillates and does not converge. 
What is a likely cause?", "options": [ "High variance in policy gradients or unstable target updates", "No experience replay used", "Discount factor too low", "Low rewards" ], "correctAnswerIndex": 0, "explanation": "High variance and unstable updates can cause oscillations; techniques like the advantage function or target smoothing help stabilize learning." }, { "id": 74, "questionText": "A multi-agent DRL environment suffers from slow learning. Which approach can improve training efficiency?", "options": [ "Ignore rewards", "Reduce network size drastically", "Use A3C with multiple asynchronous agents", "Use deterministic greedy policy only" ], "correctAnswerIndex": 2, "explanation": "Asynchronous agents explore in parallel, speeding up learning and stabilizing convergence." }, { "id": 75, "questionText": "An agent using policy gradients receives sparse rewards, making learning slow. How can this be mitigated?", "options": [ "Reduce discount factor", "Apply reward shaping to provide intermediate feedback", "Ignore sparse rewards", "Use deterministic actions only" ], "correctAnswerIndex": 1, "explanation": "Reward shaping provides more frequent signals to accelerate learning in sparse-reward environments." }, { "id": 76, "questionText": "During DRL training with continuous actions, exploration is insufficient. What should be done?", "options": [ "Set discount factor to zero", "Remove reward signals", "Add noise (e.g., Ornstein-Uhlenbeck) to actor outputs", "Use only greedy policy" ], "correctAnswerIndex": 2, "explanation": "Adding noise ensures exploration in continuous action spaces, helping the agent discover better policies." }, { "id": 77, "questionText": "A DRL agent trained with DQN is unstable and diverging. Which technique can stabilize training?", "options": [ "Set discount factor to zero", "Ignore rewards", "Use target networks and experience replay", "Reduce network capacity drastically" ], "correctAnswerIndex": 2, "explanation": "Target networks and experience replay break correlations and provide stable Q-value targets, improving convergence." }, { "id": 78, "questionText": "An agent using Actor-Critic has slow convergence due to high gradient variance. What is a solution?", "options": [ "Remove critic network", "Ignore rewards", "Use advantage function or baseline to reduce variance", "Increase discount factor to 1" ], "correctAnswerIndex": 2, "explanation": "The advantage function compares the action value to the expected state value, reducing variance without biasing updates." }, { "id": 79, "questionText": "A DRL agent trained in a partially observable environment fails to act optimally. Which method can help?", "options": [ "Use Recurrent Neural Networks to maintain internal memory", "Remove actor network", "Use DQN only", "Increase discount factor to 1" ], "correctAnswerIndex": 0, "explanation": "RNNs allow the agent to remember past observations, improving decisions under partial observability." }, { "id": 80, "questionText": "During continuous control DRL, Q-values fluctuate wildly. What can help stabilize learning?", "options": [ "Soft updates of target networks and smaller learning rates", "Ignore rewards", "Remove exploration noise", "Reduce discount factor to zero" ], "correctAnswerIndex": 0, "explanation": "Soft target updates and cautious learning rates prevent large oscillations in value estimates." }, { "id": 81, "questionText": "A DRL agent in a robotics task learns slowly due to sparse reward signals. 
What technique can accelerate learning?", "options": [ "Reduce learning rate to zero", "Remove critic network", "Apply reward shaping with intermediate rewards", "Increase discount factor to 1.0" ], "correctAnswerIndex": 2, "explanation": "Reward shaping provides denser feedback, helping the agent learn meaningful behaviors faster." }, { "id": 82, "questionText": "In a stochastic environment, a DQN agent overestimates some Q-values. Which approach helps?", "options": [ "Use on-policy updates only", "Use Double DQN to decouple selection and evaluation", "Ignore replay buffer", "Reduce discount factor to zero" ], "correctAnswerIndex": 1, "explanation": "Double DQN reduces overestimation bias by separating action selection and Q-value evaluation." }, { "id": 83, "questionText": "An agent using DDPG shows poor exploration. What is the most effective solution?", "options": [ "Use deterministic greedy policy", "Reduce discount factor", "Add temporally correlated noise to the actor actions", "Ignore reward signals" ], "correctAnswerIndex": 2, "explanation": "Temporally correlated noise (e.g., Ornstein-Uhlenbeck) encourages effective exploration in continuous action spaces." }, { "id": 84, "questionText": "During training, a policy gradient agent exhibits high variance. What strategy reduces it?", "options": [ "Remove reward signals", "Subtract a baseline or use advantage function", "Reduce discount factor to zero", "Increase learning rate drastically" ], "correctAnswerIndex": 1, "explanation": "Using a baseline reduces the variance of gradient estimates while maintaining unbiased updates." }, { "id": 85, "questionText": "A partially observable DRL environment prevents the agent from seeing the full state. What is the solution?", "options": [ "Increase learning rate", "Use RNNs or LSTMs to retain past observations", "Remove reward shaping", "Use DQN only" ], "correctAnswerIndex": 1, "explanation": "RNNs or LSTMs provide memory of past states, allowing better decision-making despite partial observability." }, { "id": 86, "questionText": "In multi-agent DRL, agents’ policies interfere with each other, causing instability. Which method can help?", "options": [ "Reduce discount factor to zero", "Use independent learning or centralized training with decentralized execution", "Ignore rewards", "Remove actor network" ], "correctAnswerIndex": 1, "explanation": "Centralized training stabilizes learning by considering other agents’ actions while still allowing decentralized execution." }, { "id": 87, "questionText": "An agent’s policy converges to suboptimal deterministic behavior too early. Which method encourages exploration?", "options": [ "Remove critic network", "Ignore reward shaping", "Increase discount factor to 1", "Add entropy regularization to the loss function" ], "correctAnswerIndex": 3, "explanation": "Entropy regularization encourages stochastic actions, preventing premature convergence." }, { "id": 88, "questionText": "During DRL training, target Q-values fluctuate wildly causing instability. Which adjustment helps?", "options": [ "Use soft updates for target networks", "Use deterministic actions only", "Remove replay buffer", "Reduce reward magnitude to zero" ], "correctAnswerIndex": 0, "explanation": "Soft updates reduce sudden changes in target Q-values, stabilizing training." }, { "id": 89, "questionText": "An agent trained in a sparse reward environment fails to discover optimal behavior. 
What can help?", "options": [ "Introduce shaped or auxiliary rewards for intermediate goals", "Remove actor network", "Reduce learning rate to zero", "Use deterministic greedy policy" ], "correctAnswerIndex": 0, "explanation": "Shaped rewards provide more frequent feedback, helping the agent learn useful behaviors." }, { "id": 90, "questionText": "During training, a continuous control DRL agent oscillates near optimal policy. What adjustment helps?", "options": [ "Reduce learning rate and apply soft target updates", "Reduce discount factor to zero", "Remove actor network", "Ignore reward signals" ], "correctAnswerIndex": 0, "explanation": "Small learning rates and soft target updates prevent large weight changes, reducing oscillations." }, { "id": 91, "questionText": "A robotic arm using DDPG reaches the target inconsistently. Which technique can improve stability?", "options": [ "Use target smoothing and reward shaping", "Ignore experience replay", "Reduce discount factor", "Remove actor network" ], "correctAnswerIndex": 0, "explanation": "Target smoothing stabilizes Q-value estimates, and reward shaping guides the agent towards correct behavior." }, { "id": 92, "questionText": "A DRL agent in a stochastic maze overestimates Q-values. What solution helps?", "options": [ "Reduce exploration", "Use Double DQN", "Ignore rewards", "Use deterministic greedy policy" ], "correctAnswerIndex": 1, "explanation": "Double DQN mitigates overestimation by decoupling action selection from evaluation." }, { "id": 93, "questionText": "An agent shows slow learning due to correlated sequential samples. Which technique helps?", "options": [ "Ignore rewards", "Experience replay with random sampling", "Reduce discount factor", "Remove critic network" ], "correctAnswerIndex": 1, "explanation": "Random sampling from replay memory breaks temporal correlations, improving stability and convergence." }, { "id": 94, "questionText": "In a partially observable environment, an agent fails to infer state. Which method can help?", "options": [ "Increase learning rate", "Use DQN only", "Remove reward shaping", "Use RNNs or LSTMs to encode history" ], "correctAnswerIndex": 3, "explanation": "RNNs or LSTMs maintain memory of past observations, allowing better state inference." }, { "id": 95, "questionText": "An agent trained in continuous control fails to explore. Which solution improves performance?", "options": [ "Use deterministic policy only", "Reduce discount factor", "Add temporally correlated noise to actions", "Ignore reward shaping" ], "correctAnswerIndex": 2, "explanation": "Temporally correlated noise encourages exploration in continuous action spaces." }, { "id": 96, "questionText": "During DRL training, an agent converges to a suboptimal deterministic policy. How to improve?", "options": [ "Add entropy regularization to encourage stochasticity", "Use DQN only", "Reduce learning rate", "Ignore rewards" ], "correctAnswerIndex": 0, "explanation": "Entropy regularization prevents premature convergence to deterministic policies, encouraging exploration." }, { "id": 97, "questionText": "A DRL agent trained in a high-dimensional visual environment struggles. Which network helps?", "options": [ "Decision trees", "RNNs only", "Convolutional Neural Networks (CNNs)", "Fully connected networks only" ], "correctAnswerIndex": 2, "explanation": "CNNs extract spatial features from images, enabling learning in complex visual environments." 
}, { "id": 98, "questionText": "An agent’s Q-values explode during training in continuous control. What helps?", "options": [ "Use deterministic greedy policy only", "Remove actor network", "Gradient clipping and smaller learning rates", "Increase rewards drastically" ], "correctAnswerIndex": 2, "explanation": "Gradient clipping prevents large updates that destabilize learning in DRL networks." }, { "id": 99, "questionText": "In a multi-agent environment, agents’ interactions destabilize learning. What can help?", "options": [ "Centralized training with decentralized execution", "Remove actor network", "Reduce discount factor to zero", "Ignore rewards" ], "correctAnswerIndex": 0, "explanation": "Centralized training considers interactions, while decentralized execution allows individual agents to act independently." }, { "id": 100, "questionText": "A robotic agent using DRL performs poorly after transferring from simulation to real world. What can help?", "options": [ "Remove actor network", "Use deterministic greedy policy", "Domain randomization and fine-tuning in real environment", "Reduce discount factor to zero" ], "correctAnswerIndex": 2, "explanation": "Domain randomization improves robustness to variations, and fine-tuning adapts the policy to real-world dynamics." } ] }