{ "title": "Reinforcement Learning Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions on Reinforcement Learning, covering rewards, value functions, and core algorithms.", "questions": [ { "id": 1, "questionText": "In reinforcement learning, what is the reward?", "options": [ "A vector representing all possible actions", "A deterministic sequence of states", "A scalar feedback signal indicating how good the last action was", "The final goal of the environment" ], "correctAnswerIndex": 2, "explanation": "Reward is the immediate scalar feedback from the environment that tells the agent how good its action was." }, { "id": 2, "questionText": "The cumulative sum of future rewards is called:", "options": [ "Transition probability", "Value function", "State space", "Policy" ], "correctAnswerIndex": 1, "explanation": "Value function estimates the expected total (cumulative) reward an agent can get from a state or state-action pair." }, { "id": 3, "questionText": "Which term represents immediate reward at time t?", "options": [ "s_t", "v_t", "π_t", "r_t" ], "correctAnswerIndex": 3, "explanation": "r_t denotes the reward received at the current time step t." }, { "id": 4, "questionText": "Which of the following is TRUE about the value function V(s)?", "options": [ "It measures reward only at the next step", "It gives expected cumulative reward starting from state s", "It is a policy-independent constant", "It directly outputs the best action" ], "correctAnswerIndex": 1, "explanation": "V(s) estimates the expected sum of future rewards starting from state s following a policy π." }, { "id": 5, "questionText": "Discount factor γ is used to:", "options": [ "Ignore past rewards", "Increase the reward infinitely", "Weight future rewards less than immediate rewards", "Randomize state transitions" ], "correctAnswerIndex": 2, "explanation": "Discount factor 0 ≤ γ ≤ 1 ensures future rewards are worth less than immediate ones." }, { "id": 6, "questionText": "Q(s, a) represents:", "options": [ "Probability of next state", "Policy mapping", "Value of taking action a in state s", "Immediate reward only" ], "correctAnswerIndex": 2, "explanation": "Q-function measures expected cumulative reward when taking action a in state s and then following policy π." }, { "id": 7, "questionText": "The difference between expected reward and actual reward is called:", "options": [ "Discount factor", "Greedy error", "Temporal Difference (TD) error", "Policy gradient" ], "correctAnswerIndex": 2, "explanation": "TD error δ = r + γV(s') − V(s) measures how much the predicted value differs from observed reward." }, { "id": 8, "questionText": "Immediate reward is:", "options": [ "A policy parameter", "Sum of all future rewards", "The feedback obtained right after an action", "Probability of action success" ], "correctAnswerIndex": 2, "explanation": "Immediate reward is the feedback signal received immediately after taking an action in a state." }, { "id": 9, "questionText": "Which function tells the value of a state under a policy π?", "options": [ "Q-value function Q(s,a)", "Reward function R(s)", "State value function V(s)", "Transition function T(s,a)" ], "correctAnswerIndex": 2, "explanation": "V(s) gives expected cumulative reward starting from state s under policy π." 
}, { "id": 10, "questionText": "Which function evaluates both state and action pair?", "options": [ "V-value function V(s)", "Discount function γ", "Q-value function Q(s, a)", "Reward function R(s)" ], "correctAnswerIndex": 2, "explanation": "Q(s,a) evaluates expected cumulative reward for taking action a in state s and then following policy π." }, { "id": 11, "questionText": "What is the purpose of a reward function R(s,a)?", "options": [ "To define environment dynamics", "To store past transitions", "To map states to actions deterministically", "To provide feedback to agent about quality of actions" ], "correctAnswerIndex": 3, "explanation": "Reward function defines the immediate payoff received by the agent for taking an action in a state." }, { "id": 12, "questionText": "Which value function is policy-specific?", "options": [ "R(s,a)", "V*(s)", "Q*(s,a)", "Vπ(s)" ], "correctAnswerIndex": 3, "explanation": "Vπ(s) depends on the specific policy π being followed." }, { "id": 13, "questionText": "What is the difference between V(s) and Q(s,a)?", "options": [ "V(s) considers only state; Q(s,a) considers state-action pair", "They are identical", "V(s) is deterministic; Q(s,a) is random", "V(s) gives immediate reward; Q(s,a) gives discounted reward" ], "correctAnswerIndex": 0, "explanation": "V(s) measures value of a state; Q(s,a) measures value of taking a specific action in that state." }, { "id": 14, "questionText": "If γ=0 in RL, the agent:", "options": [ "Considers only immediate rewards", "Maximizes long-term reward", "Ignores rewards completely", "Considers all future rewards equally" ], "correctAnswerIndex": 0, "explanation": "γ=0 makes the agent short-sighted, focusing only on immediate reward." }, { "id": 15, "questionText": "If γ approaches 1, the agent:", "options": [ "Stops learning", "Values future rewards almost as much as immediate rewards", "Ignores future rewards", "Becomes random" ], "correctAnswerIndex": 1, "explanation": "High γ makes the agent far-sighted, considering long-term consequences." }, { "id": 16, "questionText": "Which formula defines TD learning update for value function?", "options": [ "V(s) ← r only", "Q(s,a) ← r + γmax Q(s',a')", "V(s) ← γ V(s')", "V(s) ← V(s) + α[r + γV(s') − V(s)]" ], "correctAnswerIndex": 3, "explanation": "TD update modifies V(s) toward observed reward plus discounted next state value." }, { "id": 17, "questionText": "In RL, reward shaping is used to:", "options": [ "Provide additional intermediate rewards to guide learning", "Simplify environment dynamics", "Randomize action selection", "Remove future rewards" ], "correctAnswerIndex": 0, "explanation": "Reward shaping helps the agent learn faster by providing informative intermediate feedback." }, { "id": 18, "questionText": "Expected cumulative reward starting from state s and following policy π is:", "options": [ "Q*(s,a)", "R(s)", "Vπ(s)", "γ(s)" ], "correctAnswerIndex": 2, "explanation": "Vπ(s) is the expected sum of discounted rewards under policy π starting at state s." }, { "id": 19, "questionText": "Which reward type encourages agent to achieve long-term goal?", "options": [ "Random reward", "Immediate reward only", "Negative reward only", "Sparse reward" ], "correctAnswerIndex": 3, "explanation": "Sparse or delayed rewards push the agent to consider long-term strategy." 
}, { "id": 20, "questionText": "Which function gives the best achievable expected reward from a state?", "options": [ "Immediate reward function R(s)", "Optimal value function V*(s)", "Qπ(s,a)", "Policy function π(s)" ], "correctAnswerIndex": 1, "explanation": "V*(s) represents the maximum expected cumulative reward achievable from state s." }, { "id": 21, "questionText": "Q*(s,a) represents:", "options": [ "Discount factor", "Maximum expected reward for taking action a in state s and following optimal policy", "Immediate reward only", "Transition probability" ], "correctAnswerIndex": 1, "explanation": "Q*(s,a) estimates the optimal expected return for a specific state-action pair." }, { "id": 22, "questionText": "If the reward function is poorly designed, the agent may:", "options": [ "Ignore environment", "Learn undesired behavior", "Increase exploration automatically", "Immediately converge to optimal policy" ], "correctAnswerIndex": 1, "explanation": "Incorrect reward leads to reward hacking — agent may maximize reward in unintended ways." }, { "id": 23, "questionText": "Discounted future reward is calculated as:", "options": [ "γ only", "r_t only", "r_t + γ r_{t+1} + γ^2 r_{t+2} + …", "Sum of unweighted rewards" ], "correctAnswerIndex": 2, "explanation": "Discounted sum reduces importance of rewards further in the future using γ." }, { "id": 24, "questionText": "What is the purpose of Q-learning?", "options": [ "To generate random actions", "To directly update policy probabilities", "To learn the optimal action-value function", "To compute rewards only" ], "correctAnswerIndex": 2, "explanation": "Q-learning seeks to learn Q*(s,a) — the optimal expected cumulative reward function." }, { "id": 25, "questionText": "Monte Carlo methods estimate value function using:", "options": [ "TD error", "Actual returns from complete episodes", "Policy gradient", "Random rewards" ], "correctAnswerIndex": 1, "explanation": "Monte Carlo calculates V(s) or Q(s,a) using the sum of rewards observed in full episodes." }, { "id": 26, "questionText": "Bootstrapping in value function estimation refers to:", "options": [ "Resetting environment every step", "Estimating current value using future estimated values", "Using only random actions", "Ignoring future rewards" ], "correctAnswerIndex": 1, "explanation": "Bootstrapping updates estimates using other current estimates (e.g., TD learning)." }, { "id": 27, "questionText": "Which method combines bootstrapping and Monte Carlo ideas for value estimation?", "options": [ "SARSA", "Q-learning", "TD(λ) learning", "Policy gradient" ], "correctAnswerIndex": 2, "explanation": "TD(λ) uses λ parameter to mix Monte Carlo and TD bootstrapping for more stable learning." }, { "id": 28, "questionText": "What does SARSA stand for?", "options": [ "Stochastic-Action-Reward-State-Algorithm", "State-Action-Reward-State-Action", "Supervised-Action-Reward-State-Agent", "State-Action-Reward-Sequence-Approximation" ], "correctAnswerIndex": 1, "explanation": "SARSA updates Q-values using the current state, action, reward, next state, and next action." 
}, { "id": 29, "questionText": "Which of the following is TRUE about Q-learning?", "options": [ "It only works for deterministic environments", "It is on-policy and depends on agent’s current behavior", "It is off-policy and learns the optimal Q regardless of agent’s actions", "It ignores rewards completely" ], "correctAnswerIndex": 2, "explanation": "Q-learning is off-policy: it learns Q*(s,a) while following a different policy for action selection." }, { "id": 30, "questionText": "Which parameter balances importance of immediate vs future rewards?", "options": [ "Reward function R", "Exploration rate ε", "Learning rate α", "Discount factor γ" ], "correctAnswerIndex": 3, "explanation": "γ determines how much future rewards contribute to current value estimates." }, { "id": 31, "questionText": "A sparse reward environment means:", "options": [ "Rewards are continuous and immediate", "Rewards are given infrequently, usually only on goal completion", "All states give the same reward", "Rewards are always negative" ], "correctAnswerIndex": 1, "explanation": "Sparse reward settings give feedback rarely, making learning more challenging." }, { "id": 32, "questionText": "In value-based RL, what is the primary goal of the agent?", "options": [ "Minimize immediate reward", "Maximize cumulative discounted reward", "Randomly explore environment", "Reduce state space" ], "correctAnswerIndex": 1, "explanation": "The agent selects actions that maximize expected cumulative rewards over time." }, { "id": 33, "questionText": "What is the Bellman equation for V(s)?", "options": [ "V(s) = γ^t * r_t", "V(s) = E[r + γV(s’)]", "V(s) = r only", "V(s) = max Q(s,a)" ], "correctAnswerIndex": 1, "explanation": "Bellman equation expresses value as immediate reward plus discounted expected value of next state." }, { "id": 34, "questionText": "Which function represents long-term expected reward from taking a specific action?", "options": [ "V(s)", "γ(s)", "R(s)", "Q(s,a)" ], "correctAnswerIndex": 3, "explanation": "Q(s,a) evaluates cumulative reward starting with a specific action." }, { "id": 35, "questionText": "Which function estimates the maximum reward achievable from state s?", "options": [ "Qπ(s,a)", "Vπ(s)", "V*(s)", "R(s)" ], "correctAnswerIndex": 2, "explanation": "V*(s) is the optimal value function representing maximum achievable reward." }, { "id": 36, "questionText": "Temporal difference learning updates value estimates using:", "options": [ "Observed reward + estimated value of next state", "Random guesses", "Policy gradient", "Only immediate reward" ], "correctAnswerIndex": 0, "explanation": "TD uses bootstrapping: V(s) ← V(s) + α[r + γV(s') − V(s)]." }, { "id": 37, "questionText": "Which approach requires full episodes to update values?", "options": [ "TD learning", "Monte Carlo", "SARSA", "Q-learning" ], "correctAnswerIndex": 1, "explanation": "Monte Carlo estimates values based on actual returns from complete episodes." }, { "id": 38, "questionText": "Reward shaping is beneficial because it:", "options": [ "Eliminates exploration", "Guarantees deterministic policy", "Removes the discount factor", "Speeds up learning by giving intermediate rewards" ], "correctAnswerIndex": 3, "explanation": "Reward shaping provides guidance to the agent via extra signals." 
}, { "id": 39, "questionText": "Which of these is a disadvantage of sparse rewards?", "options": [ "Reward scaling issues", "Immediate overfitting", "Slower convergence and learning difficulty", "Exploration elimination" ], "correctAnswerIndex": 2, "explanation": "Sparse rewards provide limited feedback, making learning slower and exploration harder." }, { "id": 40, "questionText": "Which RL method learns directly from Q-values without policy?", "options": [ "Monte Carlo policy evaluation", "Value-based methods (e.g., Q-learning)", "Actor-Critic", "Policy gradient" ], "correctAnswerIndex": 1, "explanation": "Value-based methods estimate Q-values and derive actions via max(Q) instead of learning policy directly." }, { "id": 41, "questionText": "The TD error δ = r + γV(s') − V(s) is used to:", "options": [ "Update value estimates incrementally", "Determine next action", "Select best policy directly", "Compute discount factor" ], "correctAnswerIndex": 0, "explanation": "TD error measures prediction discrepancy to adjust value function gradually." }, { "id": 42, "questionText": "Why is Q*(s,a) considered optimal?", "options": [ "It gives immediate reward", "It ignores state transitions", "It represents maximum expected reward achievable by any policy", "It is randomly assigned" ], "correctAnswerIndex": 2, "explanation": "Q* provides the best action-value estimates regardless of current policy." }, { "id": 43, "questionText": "Which concept allows estimating future rewards without waiting for episode completion?", "options": [ "Reward clipping", "Monte Carlo", "Sparse reward", "Bootstrapping (TD learning)" ], "correctAnswerIndex": 3, "explanation": "Bootstrapping updates values using estimates of next state instead of waiting for full episode." }, { "id": 44, "questionText": "A discount factor γ close to 0 leads to:", "options": [ "Far-sighted agent", "Infinite reward accumulation", "Short-sighted agent focusing on immediate rewards", "Random action selection" ], "correctAnswerIndex": 2, "explanation": "Low γ reduces the weight of future rewards in value estimates." }, { "id": 45, "questionText": "A discount factor γ close to 1 leads to:", "options": [ "Far-sighted agent valuing future rewards", "No learning", "Randomized reward", "Immediate reward focus" ], "correctAnswerIndex": 0, "explanation": "High γ makes the agent long-term focused, considering distant rewards." }, { "id": 46, "questionText": "Which function guides agent behavior by evaluating future reward potential?", "options": [ "Reward function only", "State-action mapping", "Value function", "Transition function" ], "correctAnswerIndex": 2, "explanation": "Value functions estimate future reward potential, indirectly guiding actions." }, { "id": 47, "questionText": "Which method combines state and action evaluation to choose optimal moves?", "options": [ "TD(0) only", "Q-function", "V-function", "Monte Carlo only" ], "correctAnswerIndex": 1, "explanation": "Q(s,a) evaluates expected return for state-action pairs." }, { "id": 48, "questionText": "Which term measures the quality of an action in a state?", "options": [ "γ", "Reward shaping", "Q-value", "V-value" ], "correctAnswerIndex": 2, "explanation": "Q-value estimates long-term expected reward for taking a specific action." 
}, { "id": 49, "questionText": "Value function approximation is necessary when:", "options": [ "Actions are discrete", "State space is small", "Rewards are deterministic", "State space is too large or continuous" ], "correctAnswerIndex": 3, "explanation": "Large or continuous state spaces make tabular value storage impractical." }, { "id": 50, "questionText": "Which method learns policy indirectly via value estimates?", "options": [ "Actor-Critic only", "Value-based RL", "Monte Carlo only", "Policy gradient" ], "correctAnswerIndex": 1, "explanation": "Value-based methods choose actions via max(Q) without learning policy parameters directly." }, { "id": 51, "questionText": "In a deterministic environment, TD(0) converges to:", "options": [ "Immediate rewards only", "Random values", "True state values V(s)", "Policy parameters" ], "correctAnswerIndex": 2, "explanation": "TD(0) converges to correct V(s) if learning rate and exploration conditions are met." }, { "id": 52, "questionText": "Bootstrapping can introduce bias but reduces:", "options": [ "Variance in estimates", "Immediate rewards", "Policy randomness", "Learning rate" ], "correctAnswerIndex": 0, "explanation": "TD bootstrapping reduces variance at the cost of some bias." }, { "id": 53, "questionText": "The max operator in Q-learning helps:", "options": [ "Compute TD error only", "Discount rewards", "Randomize exploration", "Choose action with highest estimated return" ], "correctAnswerIndex": 3, "explanation": "max_a Q(s’,a) selects the action with highest expected value for next state." }, { "id": 54, "questionText": "Q-learning is considered off-policy because:", "options": [ "It learns optimal Q regardless of agent’s current actions", "It uses random rewards only", "It ignores state transitions", "It directly follows current policy" ], "correctAnswerIndex": 0, "explanation": "Off-policy learning allows learning of Q* while following exploratory policy." }, { "id": 55, "questionText": "Which function provides guidance for immediate action selection?", "options": [ "V(s)", "Reward function", "Discount factor", "Q(s,a)" ], "correctAnswerIndex": 3, "explanation": "Q-values indicate which action in current state yields highest expected reward." }, { "id": 56, "questionText": "Monte Carlo updates are unbiased but have:", "options": [ "High variance", "Immediate convergence", "No error", "Low variance" ], "correctAnswerIndex": 0, "explanation": "Monte Carlo estimates can vary widely between episodes, leading to high variance." }, { "id": 57, "questionText": "Which value function is used in policy iteration to evaluate policy?", "options": [ "Q*(s,a)", "R(s)", "Vπ(s)", "V*(s)" ], "correctAnswerIndex": 2, "explanation": "Policy evaluation uses Vπ(s) to estimate expected return under policy π." }, { "id": 58, "questionText": "Temporal difference methods combine Monte Carlo ideas and:", "options": [ "Policy gradients", "Reward clipping", "Bootstrapping", "Random exploration" ], "correctAnswerIndex": 2, "explanation": "TD methods use bootstrapping to estimate value based on next state’s current value." }, { "id": 59, "questionText": "Sparse rewards make RL more challenging because:", "options": [ "Policy gradient fails", "Agent receives little guidance during learning", "Discount factor becomes irrelevant", "Agent converges immediately" ], "correctAnswerIndex": 1, "explanation": "Without frequent feedback, the agent struggles to learn correct action-value mapping." 
}, { "id": 60, "questionText": "Which term describes expected future reward from a state-action pair?", "options": [ "V(s)", "R(s)", "Q(s,a)", "γ" ], "correctAnswerIndex": 2, "explanation": "Q(s,a) measures cumulative expected reward starting from that action." }, { "id": 61, "questionText": "Which method updates value functions continuously after every step?", "options": [ "Monte Carlo", "Reward shaping", "TD learning", "Policy gradient" ], "correctAnswerIndex": 2, "explanation": "TD learning updates V(s) incrementally using observed reward and next state value." }, { "id": 62, "questionText": "Which value function guides long-term planning in RL?", "options": [ "Policy entropy", "Reward only", "Immediate next state", "V(s) and Q(s,a)" ], "correctAnswerIndex": 3, "explanation": "V(s) and Q(s,a) provide estimates of cumulative future reward for planning actions." }, { "id": 63, "questionText": "Which is true about bootstrapped TD updates?", "options": [ "They are only for deterministic environments", "They ignore discount factor", "They reduce variance compared to Monte Carlo", "They eliminate reward function" ], "correctAnswerIndex": 2, "explanation": "Bootstrapping reduces variance but introduces bias, unlike full-episode Monte Carlo." }, { "id": 64, "questionText": "Which parameter determines learning step size in TD updates?", "options": [ "γ (discount factor)", "ε (exploration)", "α (learning rate)", "λ (trace decay)" ], "correctAnswerIndex": 2, "explanation": "α controls how much each update adjusts the current value estimate." }, { "id": 65, "questionText": "Which function represents optimal action-value function?", "options": [ "Vπ(s)", "Q*(s,a)", "R(s)", "V*(s)" ], "correctAnswerIndex": 1, "explanation": "Q*(s,a) gives the best achievable return for a state-action pair following optimal policy." }, { "id": 66, "questionText": "Which scenario illustrates reward hacking?", "options": [ "Agent stops learning", "Agent explores randomly", "Agent finds shortcut to maximize reward but violates task intention", "Agent follows optimal policy" ], "correctAnswerIndex": 2, "explanation": "Reward hacking occurs when agent exploits unintended loopholes in reward function." }, { "id": 67, "questionText": "Which function is used to derive greedy action selection?", "options": [ "Q(s,a)", "V(s)", "R(s)", "γ" ], "correctAnswerIndex": 0, "explanation": "Greedy selection picks action with maximum Q-value in current state." }, { "id": 68, "questionText": "Which parameter λ in TD(λ) balances:", "options": [ "Exploration vs exploitation", "Monte Carlo vs TD updates", "Immediate vs sparse reward", "Learning rate vs discount factor" ], "correctAnswerIndex": 1, "explanation": "λ mixes short-term TD updates with long-term Monte Carlo returns." }, { "id": 69, "questionText": "Why are value function approximators needed in large environments?", "options": [ "State space too large for tabular methods", "Discount factor irrelevant", "Policy gradients fail", "Rewards are deterministic" ], "correctAnswerIndex": 0, "explanation": "Function approximation allows generalization when storing values for every state is impossible." }, { "id": 70, "questionText": "Which function measures discrepancy between predicted and observed reward?", "options": [ "γ", "Q-value", "TD error δ", "V(s)" ], "correctAnswerIndex": 2, "explanation": "TD error δ = r + γV(s') − V(s) indicates prediction mismatch for updating values." }, { "id": 71, "questionText": "An agent consistently receives +1 reward only at goal completion. 
This is an example of:", "options": [ "Dense reward", "Negative reward", "Shaped reward", "Sparse reward" ], "correctAnswerIndex": 3, "explanation": "Sparse reward occurs when feedback is only given at task completion." }, { "id": 72, "questionText": "If Q(s,a) underestimates future rewards, the agent may:", "options": [ "Avoid valuable actions", "Ignore discount factor", "Converge instantly", "Overexplore" ], "correctAnswerIndex": 0, "explanation": "Underestimated Q-values mislead agent to ignore actions with high actual returns." }, { "id": 73, "questionText": "In episodic tasks, value function returns are calculated until:", "options": [ "First reward", "Episode ends", "Discount factor γ=0", "Next action" ], "correctAnswerIndex": 1, "explanation": "Episodic tasks compute total return from start state until terminal state." }, { "id": 74, "questionText": "Expected reward from a state following policy π is given by:", "options": [ "γ", "Vπ(s)", "R(s)", "Q*(s,a)" ], "correctAnswerIndex": 1, "explanation": "Vπ(s) = E[Σ γ^t r_t | s, π] is the formal definition." }, { "id": 75, "questionText": "Q-learning update formula is:", "options": [ "V(s) ← r only", "Policy π(s) ← π(s) + α", "Q(s,a) ← Q(s,a) + α[r + γ max Q(s’,a’) − Q(s,a)]", "TD error δ = r − V(s)" ], "correctAnswerIndex": 2, "explanation": "Q-learning uses max Q of next state to update current action value." }, { "id": 76, "questionText": "Which factor encourages exploration in value-based methods?", "options": [ "TD error δ", "ε-greedy policy", "Discount factor γ", "Learning rate α" ], "correctAnswerIndex": 1, "explanation": "ε-greedy policy selects random actions with small probability to explore new states." }, { "id": 77, "questionText": "Which method estimates Q(s,a) while following the same policy?", "options": [ "Monte Carlo", "SARSA (on-policy)", "TD(λ)", "Q-learning (off-policy)" ], "correctAnswerIndex": 1, "explanation": "SARSA uses next action chosen by current policy for updates." }, { "id": 78, "questionText": "Which technique combines immediate and future reward estimation in TD learning?", "options": [ "Monte Carlo only", "Bootstrapping", "Random policy", "Greedy selection" ], "correctAnswerIndex": 1, "explanation": "Bootstrapping blends observed reward with estimated next state value." }, { "id": 79, "questionText": "Which value function provides the highest possible expected return?", "options": [ "Immediate reward function R(s)", "Policy-specific Vπ(s)", "TD error δ", "Optimal value function V*(s)" ], "correctAnswerIndex": 3, "explanation": "V*(s) represents maximum expected cumulative reward from state s." }, { "id": 80, "questionText": "Reward shaping helps RL agent by:", "options": [ "Giving intermediate rewards to guide learning", "Eliminating exploration entirely", "Forcing deterministic actions", "Changing discount factor" ], "correctAnswerIndex": 0, "explanation": "Shaped rewards provide additional feedback to accelerate learning." }, { "id": 81, "questionText": "An agent in a maze receives +10 only when it reaches the exit, 0 otherwise. Which challenge does it face?", "options": [ "High variance in rewards", "Discount factor issues", "Immediate feedback overload", "Sparse rewards making learning slow" ], "correctAnswerIndex": 3, "explanation": "The agent gets feedback only at the goal, so intermediate steps provide no reward, slowing learning." }, { "id": 82, "questionText": "A delivery robot gets reward for each package delivered but penalty for hitting obstacles. 
How should reward shaping be applied?", "options": [ "Add small negative reward for each step to encourage faster delivery", "Ignore obstacle penalties", "Increase discount factor to 1", "Provide reward only at end" ], "correctAnswerIndex": 0, "explanation": "Adding small negative step reward incentivizes faster goal completion while maintaining obstacle penalties." }, { "id": 83, "questionText": "In a stock trading simulation, the agent receives reward only when selling stock at profit. What issue arises?", "options": [ "Overfitting to stock price", "Discount factor becomes negative", "Sparse delayed rewards can make learning inefficient", "Immediate feedback causes instability" ], "correctAnswerIndex": 2, "explanation": "Sparse and delayed reward makes it harder for the agent to learn which actions contributed to eventual profit." }, { "id": 84, "questionText": "An agent in a gridworld receives +1 for moving closer to the goal and -1 for moving away. This is an example of:", "options": [ "Shaped rewards", "Random rewards", "Sparse rewards", "Negative-only rewards" ], "correctAnswerIndex": 0, "explanation": "Reward shaping provides continuous guidance, encouraging progress toward the goal." }, { "id": 85, "questionText": "In a self-driving car simulation, if the agent only receives reward at destination, what would help learning?", "options": [ "Randomizing rewards", "Removing penalties", "Adding intermediate rewards for staying in lane and avoiding collisions", "Reducing discount factor to 0" ], "correctAnswerIndex": 2, "explanation": "Intermediate rewards guide agent step-by-step, improving learning efficiency." }, { "id": 86, "questionText": "A robot arm is learning to stack blocks. It receives reward only when the tower is complete. Which method helps?", "options": [ "Reward shaping with intermediate points for partial stacking", "Increase exploration to maximum", "Ignore intermediate failures", "Reduce learning rate" ], "correctAnswerIndex": 0, "explanation": "Providing partial rewards for successful sub-tasks speeds up learning in sparse reward settings." }, { "id": 87, "questionText": "In a scenario where the agent must navigate a dynamic environment with moving obstacles, which approach improves value estimation?", "options": [ "Monte Carlo only", "Ignore moving obstacles in rewards", "Random exploration without value update", "TD(λ) with bootstrapping for faster updates" ], "correctAnswerIndex": 3, "explanation": "TD(λ) allows combining short-term and long-term rewards for more efficient learning in dynamic environments." }, { "id": 88, "questionText": "A drone receives small negative reward for battery usage and positive reward for reaching checkpoints. What does this reward structure achieve?", "options": [ "Balances energy consumption and goal achievement", "Only optimizes immediate reward", "Encourages ignoring battery constraints", "Maximizes random exploration" ], "correctAnswerIndex": 0, "explanation": "The reward function encourages completing goals efficiently while minimizing energy use." }, { "id": 89, "questionText": "In a game, an agent finds a loophole to repeatedly collect small rewards instead of completing main quest. This is called:", "options": [ "Reward hacking", "TD error", "Sparse reward", "Bootstrapping" ], "correctAnswerIndex": 0, "explanation": "Reward hacking occurs when the agent exploits unintended reward sources instead of completing intended tasks." 
}, { "id": 90, "questionText": "An agent trained with high discount factor γ in a long-horizon task may:", "options": [ "Fail to explore", "Focus on long-term rewards, sometimes ignoring immediate gains", "Focus only on immediate reward", "Ignore reward function" ], "correctAnswerIndex": 1, "explanation": "High γ emphasizes future rewards, making the agent prioritize long-term outcomes." }, { "id": 91, "questionText": "In a simulation where an agent has multiple goals with different rewards, what is crucial for learning correct value estimates?", "options": [ "Properly scaling rewards to reflect relative importance", "Randomizing reward signals", "Ignoring discount factor", "Using immediate reward only" ], "correctAnswerIndex": 0, "explanation": "Scaling rewards ensures that high-priority goals dominate learning without distorting overall behavior." }, { "id": 92, "questionText": "If an agent receives stochastic rewards from the same action, value estimation must account for:", "options": [ "Ignoring stochasticity", "Expected value and variance", "TD error δ=0", "Immediate reward only" ], "correctAnswerIndex": 1, "explanation": "Stochastic rewards require estimating expected return and possibly managing variance to stabilize learning." }, { "id": 93, "questionText": "In multi-step tasks, an agent that overestimates future rewards may:", "options": [ "Choose risky actions expecting high payoff", "Always follow short-term reward", "Ignore environment", "Fail to update value functions" ], "correctAnswerIndex": 0, "explanation": "Overestimation in Q-values can lead to overly optimistic and risky behavior." }, { "id": 94, "questionText": "Which method helps reduce high variance in Monte Carlo returns for episodic tasks?", "options": [ "Sparse reward only", "TD bootstrapping", "Increase learning rate", "Ignore intermediate rewards" ], "correctAnswerIndex": 1, "explanation": "TD bootstrapping uses estimates from next state, reducing variance compared to full-episode returns." }, { "id": 95, "questionText": "A self-learning agent plays a competitive game. It wins small points frequently but big points only on rare strategies. How should rewards be structured?", "options": [ "Give only big rewards at game end", "Randomize reward assignment", "Remove small rewards entirely", "Balance frequent small rewards and rare big rewards to guide strategy" ], "correctAnswerIndex": 3, "explanation": "Balanced reward shaping ensures agent explores both common and rare valuable strategies." }, { "id": 96, "questionText": "Agent operates in continuous state space where exact Q-values cannot be stored. Which approach is needed?", "options": [ "Monte Carlo with tables", "Tabular Q-learning", "Function approximation (e.g., neural networks)", "Ignore approximation and use TD only" ], "correctAnswerIndex": 2, "explanation": "Continuous spaces require approximating value functions to generalize across states." }, { "id": 97, "questionText": "During training, the agent finds a shortcut to maximize reward but violates intended task. To fix this:", "options": [ "Redesign reward function to reflect intended goals", "Reduce discount factor to 0", "Increase exploration only", "Remove all negative rewards" ], "correctAnswerIndex": 0, "explanation": "Proper reward design prevents reward hacking and aligns learning with intended objectives." }, { "id": 98, "questionText": "An agent receives conflicting rewards for two simultaneous objectives. 
How should value estimates be handled?", "options": [ "Use weighted combination of rewards for single value estimate", "Ignore one objective", "Use random selection", "Reduce discount factor to 0" ], "correctAnswerIndex": 0, "explanation": "Weighted sum ensures both objectives influence learning appropriately." }, { "id": 99, "questionText": "In a delayed reward task, which technique accelerates learning?", "options": [ "Reward shaping with intermediate milestones", "Reducing learning rate", "Ignoring discount factor", "Random action selection only" ], "correctAnswerIndex": 0, "explanation": "Providing intermediate rewards guides agent through long sequences to the final goal." }, { "id": 100, "questionText": "A reinforcement learning agent in an environment with stochastic transitions and rewards can improve value estimation using:", "options": [ "TD learning with averaging or function approximation", "Immediate reward only", "Ignoring stochasticity", "Random actions without learning" ], "correctAnswerIndex": 0, "explanation": "TD methods combined with averaging or function approximation help stabilize learning in stochastic environments." } ] }