{ "title": "Deep Reinforcement Learning Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions to test and deepen your understanding of Deep Reinforcement Learning, from basic concepts to advanced topics like Deep Q-Networks, Policy Gradients, and Actor-Critic methods.", "questions": [ { "id": 1, "questionText": "What is the main difference between standard Q-Learning and Deep Q-Learning?", "options": [ "Q-Learning ignores rewards", "Deep Q-Learning uses a neural network to approximate Q-values", "Q-Learning uses continuous actions", "Deep Q-Learning requires supervised labels" ], "correctAnswerIndex": 1, "explanation": "Deep Q-Learning replaces the Q-table with a neural network to handle large or continuous state spaces." }, { "id": 2, "questionText": "Which problem does function approximation in Deep Reinforcement Learning solve?", "options": [ "Exploration vs. exploitation", "Handling large or continuous state spaces", "Reward shaping", "Reducing discount factor" ], "correctAnswerIndex": 1, "explanation": "Function approximation allows the agent to generalize Q-values across many states instead of storing a table." }, { "id": 3, "questionText": "In Deep Q-Networks (DQN), what is the purpose of the target network?", "options": [ "Generate random actions", "Provide rewards", "Stabilize learning by providing fixed Q-value targets periodically", "Replace policy network completely" ], "correctAnswerIndex": 2, "explanation": "The target network reduces oscillations by keeping Q-value targets fixed for several updates before copying from the main network." }, { "id": 4, "questionText": "What is experience replay in DRL?", "options": [ "Resetting the environment", "Storing past experiences and sampling randomly for training", "Recording rewards only", "Using supervised labels" ], "correctAnswerIndex": 1, "explanation": "Experience replay breaks correlation between sequential data and improves learning stability by training on randomly sampled past experiences." }, { "id": 5, "questionText": "Which activation function is commonly used in DRL networks?", "options": [ "Linear only", "ReLU", "Tanh only", "Sigmoid only" ], "correctAnswerIndex": 1, "explanation": "ReLU is commonly used due to its efficiency and ability to reduce vanishing gradient problems." }, { "id": 6, "questionText": "In DRL, what is the role of the discount factor γ?", "options": [ "Control neural network learning rate", "Weigh future rewards relative to immediate rewards", "Scale input features", "Select actions randomly" ], "correctAnswerIndex": 1, "explanation": "The discount factor balances the importance of immediate versus future rewards in value estimation." }, { "id": 7, "questionText": "Why is Q-learning considered off-policy?", "options": [ "It uses supervised labels", "It updates values using the best next action, not necessarily the action taken", "It follows a fixed policy only", "It ignores rewards" ], "correctAnswerIndex": 1, "explanation": "Off-policy learning uses the optimal action for updating Q-values regardless of the agent’s actual behavior policy." }, { "id": 8, "questionText": "Which problem does target network in DQN help to mitigate?", "options": [ "Exploration failure", "Instability due to moving Q-value targets", "Sparse rewards", "Reward hacking" ], "correctAnswerIndex": 1, "explanation": "Fixing Q-value targets for several steps reduces oscillations and divergence in neural network training." 
}, { "id": 9, "questionText": "In DRL, why is experience replay preferred over online updates?", "options": [ "Requires supervised data", "Reduces correlation between consecutive samples", "Only works with deterministic environments", "Avoids using discount factor" ], "correctAnswerIndex": 1, "explanation": "Sampling random experiences helps the network learn more effectively from diverse states and actions." }, { "id": 10, "questionText": "What is a primary advantage of Deep Q-Networks over tabular Q-Learning?", "options": [ "Eliminates exploration", "Removes the need for discount factor", "Can handle high-dimensional or continuous state spaces", "Requires fewer rewards" ], "correctAnswerIndex": 2, "explanation": "DQN can generalize across large state spaces using neural networks instead of storing Q-values in a table." }, { "id": 11, "questionText": "Which of the following is a common DRL benchmark environment?", "options": [ "ImageNet", "MNIST", "OpenAI Gym", "COCO" ], "correctAnswerIndex": 2, "explanation": "OpenAI Gym provides standardized environments for testing DRL algorithms." }, { "id": 12, "questionText": "Why is gradient clipping used in DRL?", "options": [ "Reduce discount factor", "Increase rewards artificially", "Control exploration rate", "Prevent exploding gradients during neural network training" ], "correctAnswerIndex": 3, "explanation": "Gradient clipping limits extreme weight updates, improving stability." }, { "id": 13, "questionText": "Which is true about the replay buffer size in DQN?", "options": [ "Size does not affect learning", "Larger buffers store more diverse experiences but use more memory", "Buffers store only rewards", "Small buffers always converge faster" ], "correctAnswerIndex": 1, "explanation": "Larger buffers provide better sample diversity, improving learning stability." }, { "id": 14, "questionText": "In DRL, what is the main challenge with continuous action spaces?", "options": [ "Exploration is unnecessary", "Rewards cannot be used", "Q-Learning requires discrete actions; approximation methods like DDPG are needed", "Discount factor cannot be applied" ], "correctAnswerIndex": 2, "explanation": "Discrete action Q-learning cannot directly handle continuous actions; actor-critic methods or policy gradients are used." }, { "id": 15, "questionText": "What is the purpose of a policy network in policy gradient methods?", "options": [ "Estimates Q-values", "Directly outputs action probabilities for a given state", "Stores experiences", "Generates rewards" ], "correctAnswerIndex": 1, "explanation": "Policy networks map states to action probabilities, allowing stochastic or deterministic policies." }, { "id": 16, "questionText": "Which DRL method is considered on-policy?", "options": [ "Double Q-Learning", "Experience Replay only", "Deep Q-Network (DQN)", "REINFORCE" ], "correctAnswerIndex": 3, "explanation": "REINFORCE updates the policy based on actions actually taken by the agent, making it on-policy." }, { "id": 17, "questionText": "Which type of neural network is commonly used in DRL for image inputs?", "options": [ "Recurrent Neural Networks (RNNs) only", "Fully connected only", "Autoencoders only", "Convolutional Neural Networks (CNNs)" ], "correctAnswerIndex": 3, "explanation": "CNNs extract spatial features from images for DRL agents like in Atari games." 
}, { "id": 18, "questionText": "In DRL, what is a major issue with high variance in policy gradient methods?", "options": [ "Q-values are ignored", "Exploration becomes deterministic", "Slow and unstable learning", "Rewards become negative" ], "correctAnswerIndex": 2, "explanation": "High variance can make gradient updates noisy, slowing convergence." }, { "id": 19, "questionText": "Which technique reduces variance in policy gradient updates?", "options": [ "Use a baseline or advantage function", "Use greedy policy", "Increase discount factor", "Ignore rewards" ], "correctAnswerIndex": 0, "explanation": "Subtracting a baseline (like state value) reduces variance while keeping the estimate unbiased." }, { "id": 20, "questionText": "What is the main advantage of Actor-Critic methods over DQN?", "options": [ "Eliminates exploration", "Requires tabular Q-table", "Can handle continuous actions and reduce variance with a value estimator", "Removes the need for rewards" ], "correctAnswerIndex": 2, "explanation": "Actor-Critic combines policy learning (actor) with value estimation (critic) for better performance, especially in continuous domains." }, { "id": 21, "questionText": "Which DRL algorithm is suitable for continuous action control?", "options": [ "DQN", "SARSA", "Deep Deterministic Policy Gradient (DDPG)", "Monte Carlo only" ], "correctAnswerIndex": 2, "explanation": "DDPG can output continuous actions using an actor network and learn value using a critic network." }, { "id": 22, "questionText": "Why are target networks important in DRL?", "options": [ "Store experience replay", "Control exploration", "Generate rewards", "Stabilize training by reducing oscillations in Q-value targets" ], "correctAnswerIndex": 3, "explanation": "Without a target network, the moving Q-value targets cause instability during neural network updates." }, { "id": 23, "questionText": "What is a common solution for partially observable environments in DRL?", "options": [ "Use Recurrent Neural Networks (RNNs) to remember past states", "Use tabular Q-Learning", "Increase discount factor", "Ignore history" ], "correctAnswerIndex": 0, "explanation": "RNNs allow the agent to maintain an internal state, improving decisions in partially observable settings." }, { "id": 24, "questionText": "Which method combines policy gradients and value estimation for stability?", "options": [ "Monte Carlo only", "SARSA only", "Advantage Actor-Critic (A2C)", "DQN only" ], "correctAnswerIndex": 2, "explanation": "A2C uses a critic to estimate value and an actor to update policy, reducing variance and improving learning." }, { "id": 25, "questionText": "In DRL, what is the purpose of epsilon-greedy policy?", "options": [ "Balance exploration and exploitation", "Reduce network size", "Ignore rewards", "Stabilize gradients" ], "correctAnswerIndex": 0, "explanation": "Epsilon-greedy chooses random actions with probability ε to explore the environment while mostly exploiting the best-known action." }, { "id": 26, "questionText": "Which optimization algorithm is commonly used to train DRL networks?", "options": [ "SGD only", "Adam", "None", "RMSProp only" ], "correctAnswerIndex": 1, "explanation": "Adam combines momentum and adaptive learning rates, making it effective for DRL training." 
}, { "id": 27, "questionText": "What is reward shaping in DRL?", "options": [ "Changing discount factor", "Modifying the reward signal to provide intermediate feedback", "Removing rewards", "Randomizing actions" ], "correctAnswerIndex": 1, "explanation": "Reward shaping provides more frequent feedback to accelerate learning while keeping the optimal policy unchanged." }, { "id": 28, "questionText": "Why is clipping rewards sometimes used in DRL?", "options": [ "Increase exploration", "Reduce network size", "Prevent large gradients and stabilize training", "Ignore rewards" ], "correctAnswerIndex": 2, "explanation": "Clipping avoids extremely large reward signals that can destabilize learning." }, { "id": 29, "questionText": "Which DRL method directly models a stochastic policy?", "options": [ "Policy Gradient (REINFORCE)", "DQN", "SARSA", "Double Q-Learning" ], "correctAnswerIndex": 0, "explanation": "Policy gradients learn a probability distribution over actions, allowing stochastic action selection." }, { "id": 30, "questionText": "In DRL, why is it important to normalize inputs?", "options": [ "Q-values become zero", "Rewards are ignored", "Exploration is unnecessary", "Neural network training is more stable and faster" ], "correctAnswerIndex": 3, "explanation": "Normalized inputs prevent large-scale differences that can hinder learning and slow convergence." }, { "id": 31, "questionText": "What is the key idea behind Double DQN?", "options": [ "Use two policies to explore the environment", "Combine policy gradient with Q-learning", "Update Q-values twice per step", "Use one network for action selection and another for evaluation to reduce overestimation" ], "correctAnswerIndex": 3, "explanation": "Double DQN separates action selection and Q-value evaluation to mitigate overestimation bias seen in standard DQN." }, { "id": 32, "questionText": "What is the purpose of prioritized experience replay?", "options": [ "Sample important experiences more frequently to improve learning efficiency", "Ignore old experiences", "Store only positive rewards", "Replay experiences in sequential order" ], "correctAnswerIndex": 0, "explanation": "Prioritized replay focuses learning on transitions with higher temporal-difference errors, improving convergence speed." }, { "id": 33, "questionText": "Which DRL algorithm is suitable for continuous control tasks?", "options": [ "DQN", "Deep Deterministic Policy Gradient (DDPG)", "Q-Learning", "SARSA" ], "correctAnswerIndex": 1, "explanation": "DDPG can handle continuous action spaces using an actor network to output continuous actions and a critic network to estimate values." }, { "id": 34, "questionText": "In Actor-Critic methods, what is the role of the critic?", "options": [ "Modify rewards", "Estimate the value function to guide the actor", "Store replay memory", "Select actions randomly" ], "correctAnswerIndex": 1, "explanation": "The critic evaluates the current policy by estimating the expected return, providing feedback to the actor for policy improvement." }, { "id": 35, "questionText": "Which advantage does A3C (Asynchronous Advantage Actor-Critic) provide over standard Actor-Critic?", "options": [ "Removes the need for value estimation", "Eliminates exploration", "Uses only one agent to reduce computation", "Parallel training with multiple agents to stabilize learning" ], "correctAnswerIndex": 3, "explanation": "A3C uses multiple asynchronous agents exploring in parallel, which stabilizes learning and improves convergence speed." 
}, { "id": 36, "questionText": "What is the main challenge of high-dimensional state spaces in DRL?", "options": [ "Rewards become negative", "Learning rate α becomes zero", "Discount factor is ignored", "Curse of dimensionality increases sample complexity" ], "correctAnswerIndex": 3, "explanation": "High-dimensional inputs require more data to learn effective policies and can slow convergence." }, { "id": 37, "questionText": "Which technique helps DRL agents learn from visual input effectively?", "options": [ "RNNs only", "Convolutional Neural Networks (CNNs)", "Decision trees", "Fully connected networks only" ], "correctAnswerIndex": 1, "explanation": "CNNs extract spatial features from images, enabling DRL agents to handle complex visual environments." }, { "id": 38, "questionText": "In DDPG, why is it necessary to add noise to actions during training?", "options": [ "Promote exploration in continuous action spaces", "Stabilize the target network", "Reduce rewards", "Increase discount factor" ], "correctAnswerIndex": 0, "explanation": "Exploration is crucial in continuous action spaces; adding noise ensures the agent explores various actions." }, { "id": 39, "questionText": "What is the purpose of advantage function in A2C or A3C?", "options": [ "Reduce variance in policy gradient updates", "Ignore state values", "Store experiences", "Increase rewards" ], "correctAnswerIndex": 0, "explanation": "Advantage function measures how much better an action is compared to the expected value, reducing variance in updates." }, { "id": 40, "questionText": "Which of the following is a major limitation of vanilla policy gradients?", "options": [ "Cannot handle discrete actions", "Requires tabular Q-table", "High variance in gradient estimates", "Ignores rewards" ], "correctAnswerIndex": 2, "explanation": "Vanilla policy gradients have high variance, making learning slow and unstable." }, { "id": 41, "questionText": "Why is normalization of input features important in DRL?", "options": [ "Increases rewards artificially", "Stabilizes neural network training and improves convergence", "Reduces exploration", "Removes discount factor" ], "correctAnswerIndex": 1, "explanation": "Normalization prevents large-scale differences that could destabilize learning and slow down convergence." }, { "id": 42, "questionText": "In DRL, what is the role of target smoothing in DDPG?", "options": [ "Prevent oscillations by slowly updating target networks", "Ignore discount factor", "Randomize actions", "Generate rewards" ], "correctAnswerIndex": 0, "explanation": "Soft updates of the target network improve training stability by avoiding large sudden changes in Q-values." }, { "id": 43, "questionText": "What does the term 'on-policy' mean in DRL?", "options": [ "Agent stores experiences only", "Agent ignores rewards", "Agent uses a separate policy for evaluation", "Agent updates policy using actions it actually takes" ], "correctAnswerIndex": 3, "explanation": "On-policy methods learn the value of the policy being executed, unlike off-policy methods which can learn from other policies." }, { "id": 44, "questionText": "What does 'off-policy' learning in DRL allow?", "options": [ "Ignoring rewards", "Reducing discount factor to zero", "Only learning from current policy", "Learning optimal policy using experiences from a different behavior policy" ], "correctAnswerIndex": 3, "explanation": "Off-policy learning allows using past experiences or exploratory actions to learn the optimal policy." 
}, { "id": 45, "questionText": "Which method is used to reduce correlation between consecutive samples in DRL?", "options": [ "Target networks only", "Policy gradient", "Greedy policy", "Experience replay" ], "correctAnswerIndex": 3, "explanation": "Experience replay randomly samples past experiences, breaking temporal correlations and improving learning stability." }, { "id": 46, "questionText": "Which DRL algorithm is suitable for environments with discrete action spaces?", "options": [ "DDPG", "Policy Gradient with continuous actor", "SARSA only", "Deep Q-Network (DQN)" ], "correctAnswerIndex": 3, "explanation": "DQN works well in discrete action spaces by estimating Q-values for all possible actions." }, { "id": 47, "questionText": "Why is reward clipping sometimes applied in DRL?", "options": [ "Normalize inputs", "Prevent very large rewards from destabilizing training", "Increase exploration", "Reduce discount factor" ], "correctAnswerIndex": 1, "explanation": "Clipping rewards prevents extreme updates in the network that could destabilize learning." }, { "id": 48, "questionText": "What is the main advantage of using Actor-Critic over pure policy gradients?", "options": [ "Removes discount factor", "No neural network required", "Eliminates need for exploration", "Reduced variance and better sample efficiency" ], "correctAnswerIndex": 3, "explanation": "The critic estimates value function to guide the actor, reducing variance compared to vanilla policy gradient." }, { "id": 49, "questionText": "In DRL, why is gradient clipping applied?", "options": [ "Increase discount factor", "Prevent exploding gradients and stabilize learning", "Store experiences", "Normalize inputs" ], "correctAnswerIndex": 1, "explanation": "Clipping gradient magnitudes ensures neural network weights do not change abruptly, preventing instability." }, { "id": 50, "questionText": "Which DRL algorithm can handle both discrete and continuous action spaces with separate actor and critic networks?", "options": [ "SARSA only", "Actor-Critic / DDPG", "REINFORCE only", "DQN only" ], "correctAnswerIndex": 1, "explanation": "Actor-Critic methods and DDPG separate policy and value networks, allowing application in both discrete and continuous domains." }, { "id": 51, "questionText": "What is the main idea behind Advantage Actor-Critic (A2C)?", "options": [ "Store experiences for replay", "Ignore policy updates", "Only use the critic for evaluation", "Use the advantage function to reduce variance in policy updates" ], "correctAnswerIndex": 3, "explanation": "Advantage function improves learning stability by comparing action value against expected value for the state." }, { "id": 52, "questionText": "In DRL, what is a major issue with partial observability?", "options": [ "Discount factor cannot be applied", "Q-values are ignored", "Agent does not have full knowledge of the environment, making decision-making harder", "Rewards become deterministic" ], "correctAnswerIndex": 2, "explanation": "Partial observability requires the agent to infer hidden state information, often handled with RNNs." }, { "id": 53, "questionText": "Which technique improves exploration in continuous action DRL algorithms?", "options": [ "Clipping rewards", "Use deterministic greedy policy only", "Add noise (e.g., Ornstein-Uhlenbeck process in DDPG) to actor outputs", "Reducing discount factor" ], "correctAnswerIndex": 2, "explanation": "Adding noise ensures the agent explores diverse actions in continuous spaces." 
}, { "id": 54, "questionText": "What is the purpose of soft updates in target networks?", "options": [ "Increase exploration", "Smoothly update target network parameters to improve stability", "Ignore experience replay", "Clip rewards" ], "correctAnswerIndex": 1, "explanation": "Soft updates prevent large jumps in Q-value targets, stabilizing training." }, { "id": 55, "questionText": "Which DRL algorithm is particularly suitable for large discrete action spaces?", "options": [ "DDPG", "Dueling DQN", "SARSA only", "Policy Gradient only" ], "correctAnswerIndex": 1, "explanation": "Dueling DQN separates state-value and advantage function, allowing efficient learning in large discrete action spaces." }, { "id": 56, "questionText": "Why is advantage function useful in policy gradient methods?", "options": [ "Removes need for rewards", "Eliminates discount factor", "Reduces variance without introducing bias", "Stores experiences" ], "correctAnswerIndex": 2, "explanation": "By comparing action value to baseline, variance in gradient estimates decreases, improving stability." }, { "id": 57, "questionText": "In DRL, what is entropy regularization?", "options": [ "Reduce rewards", "Encourage exploration by adding entropy of the policy to the loss function", "Clips gradients", "Store experience replay" ], "correctAnswerIndex": 1, "explanation": "Entropy regularization prevents premature convergence to deterministic policies, encouraging exploration." }, { "id": 58, "questionText": "Which neural network is used to handle sequences in partially observable DRL tasks?", "options": [ "Decision trees", "Fully connected networks", "Recurrent Neural Networks (RNNs)", "CNNs only" ], "correctAnswerIndex": 2, "explanation": "RNNs maintain hidden states over time, allowing the agent to infer information from past observations." }, { "id": 59, "questionText": "Why is target network in DQN updated periodically?", "options": [ "Clip gradients", "Reduce oscillations and stabilize learning", "Increase rewards", "Reduce exploration" ], "correctAnswerIndex": 1, "explanation": "Periodic updates provide fixed targets for several steps, preventing divergence." }, { "id": 60, "questionText": "What is the main difference between DDPG and DQN?", "options": [ "DQN uses actor-critic; DDPG does not", "DDPG handles continuous actions; DQN handles discrete actions", "DQN is on-policy", "DDPG requires tabular Q-table" ], "correctAnswerIndex": 1, "explanation": "DDPG uses actor-critic for continuous actions, while DQN uses Q-value approximations for discrete actions." }, { "id": 61, "questionText": "What is the role of the critic in Actor-Critic methods?", "options": [ "Estimate value function to evaluate actions", "Normalize inputs", "Select random actions", "Clip rewards" ], "correctAnswerIndex": 0, "explanation": "The critic evaluates the policy by providing feedback on the quality of actions, guiding the actor." }, { "id": 62, "questionText": "Which DRL method is designed for multi-agent asynchronous training?", "options": [ "DQN", "SARSA", "DDPG", "A3C" ], "correctAnswerIndex": 3, "explanation": "A3C uses multiple agents training in parallel, improving efficiency and stability." }, { "id": 63, "questionText": "Which approach addresses overestimation in Q-values in DRL?", "options": [ "Double DQN", "Actor-Critic", "Policy gradient", "DQN only" ], "correctAnswerIndex": 0, "explanation": "Double DQN separates selection and evaluation, reducing overestimation bias in Q-learning." 
}, { "id": 64, "questionText": "Which DRL algorithm uses deterministic policy for continuous control?", "options": [ "DQN", "A2C", "DDPG", "REINFORCE" ], "correctAnswerIndex": 2, "explanation": "DDPG outputs deterministic actions from the actor network, suitable for continuous action environments." }, { "id": 65, "questionText": "Why is reward shaping useful in DRL?", "options": [ "Eliminates exploration", "Removes discount factor", "Provides intermediate rewards to accelerate learning", "Stores experiences" ], "correctAnswerIndex": 2, "explanation": "Shaping rewards gives the agent feedback on progress towards goals, improving convergence speed." }, { "id": 66, "questionText": "Which technique reduces variance in policy gradient methods?", "options": [ "Using advantage function or baseline", "Increasing learning rate", "Reducing discount factor", "Clipping rewards" ], "correctAnswerIndex": 0, "explanation": "Subtracting a baseline from the return reduces variance while keeping gradient estimates unbiased." }, { "id": 67, "questionText": "In DRL, why is exploration important?", "options": [ "Ignore rewards", "Store experiences", "Ensure agent discovers optimal actions rather than exploiting suboptimal known actions", "Reduce discount factor" ], "correctAnswerIndex": 2, "explanation": "Exploration allows the agent to learn about the environment and avoid getting stuck in local optima." }, { "id": 68, "questionText": "Which problem does partial observability introduce in DRL?", "options": [ "Learning rate becomes zero", "Rewards become deterministic", "Agent cannot fully observe the environment state, making decision-making harder", "Discount factor is ignored" ], "correctAnswerIndex": 2, "explanation": "Partial observability requires the agent to maintain internal memory or inference to act effectively." }, { "id": 69, "questionText": "Which DRL algorithm is on-policy?", "options": [ "DQN", "A2C", "Double DQN", "DDPG" ], "correctAnswerIndex": 1, "explanation": "A2C updates the policy based on actions actually taken, making it on-policy." }, { "id": 70, "questionText": "Why is entropy regularization used in policy gradient DRL?", "options": [ "Reduce rewards", "Encourage exploration by preventing premature convergence to deterministic policies", "Clip gradients", "Normalize inputs" ], "correctAnswerIndex": 1, "explanation": "Entropy regularization adds a term to the loss to favor higher-entropy (more exploratory) policies." }, { "id": 71, "questionText": "A robot using DDPG in a continuous action space keeps colliding with obstacles. What is the best approach?", "options": [ "Reduce discount factor to zero", "Use greedy deterministic policy only", "Modify the reward function to penalize collisions heavily", "Ignore collisions and continue training" ], "correctAnswerIndex": 2, "explanation": "Reward shaping helps the agent learn safer actions while maintaining exploration." }, { "id": 72, "questionText": "A DRL agent trained with DQN in a stochastic environment overestimates Q-values. What modification can help?", "options": [ "Use Double DQN to separate action selection and evaluation", "Use on-policy updates only", "Ignore rewards", "Increase learning rate drastically" ], "correctAnswerIndex": 0, "explanation": "Double DQN mitigates overestimation by using separate networks for selection and evaluation." }, { "id": 73, "questionText": "During training, a DRL agent’s policy oscillates and does not converge. 
What is a likely cause?", "options": [ "High variance in policy gradients or unstable target updates", "No experience replay used", "Discount factor too low", "Low rewards" ], "correctAnswerIndex": 0, "explanation": "High variance and unstable updates can cause oscillations; techniques like the advantage function or target smoothing help stabilize learning." }, { "id": 74, "questionText": "A multi-agent DRL environment suffers from slow learning. Which approach can improve training efficiency?", "options": [ "Ignore rewards", "Reduce network size drastically", "Use A3C with multiple asynchronous agents", "Use deterministic greedy policy only" ], "correctAnswerIndex": 2, "explanation": "Asynchronous agents explore in parallel, speeding up learning and stabilizing convergence." }, { "id": 75, "questionText": "An agent using policy gradients receives sparse rewards, making learning slow. How can this be mitigated?", "options": [ "Reduce discount factor", "Apply reward shaping to provide intermediate feedback", "Ignore sparse rewards", "Use deterministic actions only" ], "correctAnswerIndex": 1, "explanation": "Reward shaping provides more frequent signals to accelerate learning in sparse-reward environments." }, { "id": 76, "questionText": "During DRL training with continuous actions, exploration is insufficient. What should be done?", "options": [ "Set discount factor to zero", "Remove reward signals", "Add noise (e.g., Ornstein-Uhlenbeck) to actor outputs", "Use only greedy policy" ], "correctAnswerIndex": 2, "explanation": "Adding noise ensures exploration in continuous action spaces, helping the agent discover better policies." }, { "id": 77, "questionText": "A DRL agent trained with DQN is unstable and diverging. Which technique can stabilize training?", "options": [ "Set discount factor to zero", "Ignore rewards", "Use target networks and experience replay", "Reduce network capacity drastically" ], "correctAnswerIndex": 2, "explanation": "Target networks and experience replay break correlations and provide stable Q-value targets, improving convergence." }, { "id": 78, "questionText": "An agent using Actor-Critic has slow convergence due to high gradient variance. What is a solution?", "options": [ "Remove critic network", "Ignore rewards", "Use advantage function or baseline to reduce variance", "Increase discount factor to 1" ], "correctAnswerIndex": 2, "explanation": "The advantage function compares the action value to the expected state value, reducing variance without biasing updates." }, { "id": 79, "questionText": "A DRL agent trained in a partially observable environment fails to act optimally. Which method can help?", "options": [ "Use Recurrent Neural Networks to maintain internal memory", "Remove actor network", "Use DQN only", "Increase discount factor to 1" ], "correctAnswerIndex": 0, "explanation": "RNNs allow the agent to remember past observations, improving decisions under partial observability." }, { "id": 80, "questionText": "During continuous control DRL, Q-values fluctuate wildly. What can help stabilize learning?", "options": [ "Soft updates of target networks and smaller learning rates", "Ignore rewards", "Remove exploration noise", "Reduce discount factor to zero" ], "correctAnswerIndex": 0, "explanation": "Soft target updates and cautious learning rates prevent large oscillations in value estimates." }, { "id": 81, "questionText": "A DRL agent in a robotics task learns slowly due to sparse reward signals. 
What technique can accelerate learning?", "options": [ "Reduce learning rate to zero", "Remove critic network", "Apply reward shaping with intermediate rewards", "Increase discount factor to 1.0" ], "correctAnswerIndex": 2, "explanation": "Reward shaping provides denser feedback, helping the agent learn meaningful behaviors faster." }, { "id": 82, "questionText": "In a stochastic environment, a DQN agent overestimates some Q-values. Which approach helps?", "options": [ "Use on-policy updates only", "Use Double DQN to decouple selection and evaluation", "Ignore replay buffer", "Reduce discount factor to zero" ], "correctAnswerIndex": 1, "explanation": "Double DQN reduces overestimation bias by separating action selection and Q-value evaluation." }, { "id": 83, "questionText": "An agent using DDPG shows poor exploration. What is the most effective solution?", "options": [ "Use deterministic greedy policy", "Reduce discount factor", "Add temporally correlated noise to the actor actions", "Ignore reward signals" ], "correctAnswerIndex": 2, "explanation": "Temporally correlated noise (e.g., Ornstein-Uhlenbeck) encourages effective exploration in continuous action spaces." }, { "id": 84, "questionText": "During training, a policy gradient agent exhibits high variance. What strategy reduces it?", "options": [ "Remove reward signals", "Subtract a baseline or use advantage function", "Reduce discount factor to zero", "Increase learning rate drastically" ], "correctAnswerIndex": 1, "explanation": "Using a baseline reduces the variance of gradient estimates while maintaining unbiased updates." }, { "id": 85, "questionText": "A partially observable DRL environment prevents the agent from seeing the full state. What is the solution?", "options": [ "Increase learning rate", "Use RNNs or LSTMs to retain past observations", "Remove reward shaping", "Use DQN only" ], "correctAnswerIndex": 1, "explanation": "RNNs or LSTMs provide memory of past states, allowing better decision-making despite partial observability." }, { "id": 86, "questionText": "In multi-agent DRL, agents’ policies interfere with each other, causing instability. Which method can help?", "options": [ "Reduce discount factor to zero", "Use independent learning or centralized training with decentralized execution", "Ignore rewards", "Remove actor network" ], "correctAnswerIndex": 1, "explanation": "Centralized training stabilizes learning by considering other agents’ actions while still allowing decentralized execution." }, { "id": 87, "questionText": "An agent’s policy converges to suboptimal deterministic behavior too early. Which method encourages exploration?", "options": [ "Remove critic network", "Ignore reward shaping", "Increase discount factor to 1", "Add entropy regularization to the loss function" ], "correctAnswerIndex": 3, "explanation": "Entropy regularization encourages stochastic actions, preventing premature convergence." }, { "id": 88, "questionText": "During DRL training, target Q-values fluctuate wildly causing instability. Which adjustment helps?", "options": [ "Use soft updates for target networks", "Use deterministic actions only", "Remove replay buffer", "Reduce reward magnitude to zero" ], "correctAnswerIndex": 0, "explanation": "Soft updates reduce sudden changes in target Q-values, stabilizing training." }, { "id": 89, "questionText": "An agent trained in a sparse reward environment fails to discover optimal behavior. 
What can help?", "options": [ "Introduce shaped or auxiliary rewards for intermediate goals", "Remove actor network", "Reduce learning rate to zero", "Use deterministic greedy policy" ], "correctAnswerIndex": 0, "explanation": "Shaped rewards provide more frequent feedback, helping the agent learn useful behaviors." }, { "id": 90, "questionText": "During training, a continuous control DRL agent oscillates near optimal policy. What adjustment helps?", "options": [ "Reduce learning rate and apply soft target updates", "Reduce discount factor to zero", "Remove actor network", "Ignore reward signals" ], "correctAnswerIndex": 0, "explanation": "Small learning rates and soft target updates prevent large weight changes, reducing oscillations." }, { "id": 91, "questionText": "A robotic arm using DDPG reaches the target inconsistently. Which technique can improve stability?", "options": [ "Use target smoothing and reward shaping", "Ignore experience replay", "Reduce discount factor", "Remove actor network" ], "correctAnswerIndex": 0, "explanation": "Target smoothing stabilizes Q-value estimates, and reward shaping guides the agent towards correct behavior." }, { "id": 92, "questionText": "A DRL agent in a stochastic maze overestimates Q-values. What solution helps?", "options": [ "Reduce exploration", "Use Double DQN", "Ignore rewards", "Use deterministic greedy policy" ], "correctAnswerIndex": 1, "explanation": "Double DQN mitigates overestimation by decoupling action selection from evaluation." }, { "id": 93, "questionText": "An agent shows slow learning due to correlated sequential samples. Which technique helps?", "options": [ "Ignore rewards", "Experience replay with random sampling", "Reduce discount factor", "Remove critic network" ], "correctAnswerIndex": 1, "explanation": "Random sampling from replay memory breaks temporal correlations, improving stability and convergence." }, { "id": 94, "questionText": "In a partially observable environment, an agent fails to infer state. Which method can help?", "options": [ "Increase learning rate", "Use DQN only", "Remove reward shaping", "Use RNNs or LSTMs to encode history" ], "correctAnswerIndex": 3, "explanation": "RNNs or LSTMs maintain memory of past observations, allowing better state inference." }, { "id": 95, "questionText": "An agent trained in continuous control fails to explore. Which solution improves performance?", "options": [ "Use deterministic policy only", "Reduce discount factor", "Add temporally correlated noise to actions", "Ignore reward shaping" ], "correctAnswerIndex": 2, "explanation": "Temporally correlated noise encourages exploration in continuous action spaces." }, { "id": 96, "questionText": "During DRL training, an agent converges to a suboptimal deterministic policy. How to improve?", "options": [ "Add entropy regularization to encourage stochasticity", "Use DQN only", "Reduce learning rate", "Ignore rewards" ], "correctAnswerIndex": 0, "explanation": "Entropy regularization prevents premature convergence to deterministic policies, encouraging exploration." }, { "id": 97, "questionText": "A DRL agent trained in a high-dimensional visual environment struggles. Which network helps?", "options": [ "Decision trees", "RNNs only", "Convolutional Neural Networks (CNNs)", "Fully connected networks only" ], "correctAnswerIndex": 2, "explanation": "CNNs extract spatial features from images, enabling learning in complex visual environments." 
}, { "id": 98, "questionText": "An agent’s Q-values explode during training in continuous control. What helps?", "options": [ "Use deterministic greedy policy only", "Remove actor network", "Gradient clipping and smaller learning rates", "Increase rewards drastically" ], "correctAnswerIndex": 2, "explanation": "Gradient clipping prevents large updates that destabilize learning in DRL networks." }, { "id": 99, "questionText": "In a multi-agent environment, agents’ interactions destabilize learning. What can help?", "options": [ "Centralized training with decentralized execution", "Remove actor network", "Reduce discount factor to zero", "Ignore rewards" ], "correctAnswerIndex": 0, "explanation": "Centralized training considers interactions, while decentralized execution allows individual agents to act independently." }, { "id": 100, "questionText": "A robotic agent using DRL performs poorly after transferring from simulation to real world. What can help?", "options": [ "Remove actor network", "Use deterministic greedy policy", "Domain randomization and fine-tuning in real environment", "Reduce discount factor to zero" ], "correctAnswerIndex": 2, "explanation": "Domain randomization improves robustness to variations, and fine-tuning adapts the policy to real-world dynamics." } ] }