{ "title": "Reinforcement Learning Mastery: 100 MCQs", "description": "A comprehensive set of 100 multiple-choice questions on Reinforcement Learning, covering rewards, value functions, and core algorithms.", "questions": [ { "id": 1, "questionText": "In reinforcement learning, what is the reward?", "options": [ "A vector representing all possible actions", "A deterministic sequence of states", "A scalar feedback signal indicating how good the last action was", "The final goal of the environment" ], "correctAnswerIndex": 2, "explanation": "Reward is the immediate scalar feedback from the environment that tells the agent how good its action was." }, { "id": 2, "questionText": "The cumulative sum of future rewards is called:", "options": [ "Transition probability", "Value function", "State space", "Policy" ], "correctAnswerIndex": 1, "explanation": "Value function estimates the expected total (cumulative) reward an agent can get from a state or state-action pair." }, { "id": 3, "questionText": "Which term represents immediate reward at time t?", "options": [ "s_t", "v_t", "π_t", "r_t" ], "correctAnswerIndex": 3, "explanation": "r_t denotes the reward received at the current time step t." }, { "id": 4, "questionText": "Which of the following is TRUE about the value function V(s)?", "options": [ "It measures reward only at the next step", "It gives expected cumulative reward starting from state s", "It is a policy-independent constant", "It directly outputs the best action" ], "correctAnswerIndex": 1, "explanation": "V(s) estimates the expected sum of future rewards starting from state s following a policy π." }, { "id": 5, "questionText": "Discount factor γ is used to:", "options": [ "Ignore past rewards", "Increase the reward infinitely", "Weight future rewards less than immediate rewards", "Randomize state transitions" ], "correctAnswerIndex": 2, "explanation": "Discount factor 0 ≤ γ ≤ 1 ensures future rewards are worth less than immediate ones." }, { "id": 6, "questionText": "Q(s, a) represents:", "options": [ "Probability of next state", "Policy mapping", "Value of taking action a in state s", "Immediate reward only" ], "correctAnswerIndex": 2, "explanation": "Q-function measures expected cumulative reward when taking action a in state s and then following policy π." }, { "id": 7, "questionText": "The difference between expected reward and actual reward is called:", "options": [ "Discount factor", "Greedy error", "Temporal Difference (TD) error", "Policy gradient" ], "correctAnswerIndex": 2, "explanation": "TD error δ = r + γV(s') − V(s) measures how much the predicted value differs from observed reward." }, { "id": 8, "questionText": "Immediate reward is:", "options": [ "A policy parameter", "Sum of all future rewards", "The feedback obtained right after an action", "Probability of action success" ], "correctAnswerIndex": 2, "explanation": "Immediate reward is the feedback signal received immediately after taking an action in a state." }, { "id": 9, "questionText": "Which function tells the value of a state under a policy π?", "options": [ "Q-value function Q(s,a)", "Reward function R(s)", "State value function V(s)", "Transition function T(s,a)" ], "correctAnswerIndex": 2, "explanation": "V(s) gives expected cumulative reward starting from state s under policy π." 
}, { "id": 10, "questionText": "Which function evaluates both state and action pair?", "options": [ "V-value function V(s)", "Discount function γ", "Q-value function Q(s, a)", "Reward function R(s)" ], "correctAnswerIndex": 2, "explanation": "Q(s,a) evaluates expected cumulative reward for taking action a in state s and then following policy π." }, { "id": 11, "questionText": "What is the purpose of a reward function R(s,a)?", "options": [ "To define environment dynamics", "To store past transitions", "To map states to actions deterministically", "To provide feedback to agent about quality of actions" ], "correctAnswerIndex": 3, "explanation": "Reward function defines the immediate payoff received by the agent for taking an action in a state." }, { "id": 12, "questionText": "Which value function is policy-specific?", "options": [ "R(s,a)", "V*(s)", "Q*(s,a)", "Vπ(s)" ], "correctAnswerIndex": 3, "explanation": "Vπ(s) depends on the specific policy π being followed." }, { "id": 13, "questionText": "What is the difference between V(s) and Q(s,a)?", "options": [ "V(s) considers only state; Q(s,a) considers state-action pair", "They are identical", "V(s) is deterministic; Q(s,a) is random", "V(s) gives immediate reward; Q(s,a) gives discounted reward" ], "correctAnswerIndex": 0, "explanation": "V(s) measures value of a state; Q(s,a) measures value of taking a specific action in that state." }, { "id": 14, "questionText": "If γ=0 in RL, the agent:", "options": [ "Considers only immediate rewards", "Maximizes long-term reward", "Ignores rewards completely", "Considers all future rewards equally" ], "correctAnswerIndex": 0, "explanation": "γ=0 makes the agent short-sighted, focusing only on immediate reward." }, { "id": 15, "questionText": "If γ approaches 1, the agent:", "options": [ "Stops learning", "Values future rewards almost as much as immediate rewards", "Ignores future rewards", "Becomes random" ], "correctAnswerIndex": 1, "explanation": "High γ makes the agent far-sighted, considering long-term consequences." }, { "id": 16, "questionText": "Which formula defines TD learning update for value function?", "options": [ "V(s) ← r only", "Q(s,a) ← r + γmax Q(s',a')", "V(s) ← γ V(s')", "V(s) ← V(s) + α[r + γV(s') − V(s)]" ], "correctAnswerIndex": 3, "explanation": "TD update modifies V(s) toward observed reward plus discounted next state value." }, { "id": 17, "questionText": "In RL, reward shaping is used to:", "options": [ "Provide additional intermediate rewards to guide learning", "Simplify environment dynamics", "Randomize action selection", "Remove future rewards" ], "correctAnswerIndex": 0, "explanation": "Reward shaping helps the agent learn faster by providing informative intermediate feedback." }, { "id": 18, "questionText": "Expected cumulative reward starting from state s and following policy π is:", "options": [ "Q*(s,a)", "R(s)", "Vπ(s)", "γ(s)" ], "correctAnswerIndex": 2, "explanation": "Vπ(s) is the expected sum of discounted rewards under policy π starting at state s." }, { "id": 19, "questionText": "Which reward type encourages agent to achieve long-term goal?", "options": [ "Random reward", "Immediate reward only", "Negative reward only", "Sparse reward" ], "correctAnswerIndex": 3, "explanation": "Sparse or delayed rewards push the agent to consider long-term strategy." 
}, { "id": 20, "questionText": "Which function gives the best achievable expected reward from a state?", "options": [ "Immediate reward function R(s)", "Optimal value function V*(s)", "Qπ(s,a)", "Policy function π(s)" ], "correctAnswerIndex": 1, "explanation": "V*(s) represents the maximum expected cumulative reward achievable from state s." }, { "id": 21, "questionText": "Q*(s,a) represents:", "options": [ "Discount factor", "Maximum expected reward for taking action a in state s and following optimal policy", "Immediate reward only", "Transition probability" ], "correctAnswerIndex": 1, "explanation": "Q*(s,a) estimates the optimal expected return for a specific state-action pair." }, { "id": 22, "questionText": "If the reward function is poorly designed, the agent may:", "options": [ "Ignore environment", "Learn undesired behavior", "Increase exploration automatically", "Immediately converge to optimal policy" ], "correctAnswerIndex": 1, "explanation": "Incorrect reward leads to reward hacking — agent may maximize reward in unintended ways." }, { "id": 23, "questionText": "Discounted future reward is calculated as:", "options": [ "γ only", "r_t only", "r_t + γ r_{t+1} + γ^2 r_{t+2} + …", "Sum of unweighted rewards" ], "correctAnswerIndex": 2, "explanation": "Discounted sum reduces importance of rewards further in the future using γ." }, { "id": 24, "questionText": "What is the purpose of Q-learning?", "options": [ "To generate random actions", "To directly update policy probabilities", "To learn the optimal action-value function", "To compute rewards only" ], "correctAnswerIndex": 2, "explanation": "Q-learning seeks to learn Q*(s,a) — the optimal expected cumulative reward function." }, { "id": 25, "questionText": "Monte Carlo methods estimate value function using:", "options": [ "TD error", "Actual returns from complete episodes", "Policy gradient", "Random rewards" ], "correctAnswerIndex": 1, "explanation": "Monte Carlo calculates V(s) or Q(s,a) using the sum of rewards observed in full episodes." }, { "id": 26, "questionText": "Bootstrapping in value function estimation refers to:", "options": [ "Resetting environment every step", "Estimating current value using future estimated values", "Using only random actions", "Ignoring future rewards" ], "correctAnswerIndex": 1, "explanation": "Bootstrapping updates estimates using other current estimates (e.g., TD learning)." }, { "id": 27, "questionText": "Which method combines bootstrapping and Monte Carlo ideas for value estimation?", "options": [ "SARSA", "Q-learning", "TD(λ) learning", "Policy gradient" ], "correctAnswerIndex": 2, "explanation": "TD(λ) uses λ parameter to mix Monte Carlo and TD bootstrapping for more stable learning." }, { "id": 28, "questionText": "What does SARSA stand for?", "options": [ "Stochastic-Action-Reward-State-Algorithm", "State-Action-Reward-State-Action", "Supervised-Action-Reward-State-Agent", "State-Action-Reward-Sequence-Approximation" ], "correctAnswerIndex": 1, "explanation": "SARSA updates Q-values using the current state, action, reward, next state, and next action." 
}, { "id": 29, "questionText": "Which of the following is TRUE about Q-learning?", "options": [ "It only works for deterministic environments", "It is on-policy and depends on agent’s current behavior", "It is off-policy and learns the optimal Q regardless of agent’s actions", "It ignores rewards completely" ], "correctAnswerIndex": 2, "explanation": "Q-learning is off-policy: it learns Q*(s,a) while following a different policy for action selection." }, { "id": 30, "questionText": "Which parameter balances importance of immediate vs future rewards?", "options": [ "Reward function R", "Exploration rate ε", "Learning rate α", "Discount factor γ" ], "correctAnswerIndex": 3, "explanation": "γ determines how much future rewards contribute to current value estimates." }, { "id": 31, "questionText": "A sparse reward environment means:", "options": [ "Rewards are continuous and immediate", "Rewards are given infrequently, usually only on goal completion", "All states give the same reward", "Rewards are always negative" ], "correctAnswerIndex": 1, "explanation": "Sparse reward settings give feedback rarely, making learning more challenging." }, { "id": 32, "questionText": "In value-based RL, what is the primary goal of the agent?", "options": [ "Minimize immediate reward", "Maximize cumulative discounted reward", "Randomly explore environment", "Reduce state space" ], "correctAnswerIndex": 1, "explanation": "The agent selects actions that maximize expected cumulative rewards over time." }, { "id": 33, "questionText": "What is the Bellman equation for V(s)?", "options": [ "V(s) = γ^t * r_t", "V(s) = E[r + γV(s’)]", "V(s) = r only", "V(s) = max Q(s,a)" ], "correctAnswerIndex": 1, "explanation": "Bellman equation expresses value as immediate reward plus discounted expected value of next state." }, { "id": 34, "questionText": "Which function represents long-term expected reward from taking a specific action?", "options": [ "V(s)", "γ(s)", "R(s)", "Q(s,a)" ], "correctAnswerIndex": 3, "explanation": "Q(s,a) evaluates cumulative reward starting with a specific action." }, { "id": 35, "questionText": "Which function estimates the maximum reward achievable from state s?", "options": [ "Qπ(s,a)", "Vπ(s)", "V*(s)", "R(s)" ], "correctAnswerIndex": 2, "explanation": "V*(s) is the optimal value function representing maximum achievable reward." }, { "id": 36, "questionText": "Temporal difference learning updates value estimates using:", "options": [ "Observed reward + estimated value of next state", "Random guesses", "Policy gradient", "Only immediate reward" ], "correctAnswerIndex": 0, "explanation": "TD uses bootstrapping: V(s) ← V(s) + α[r + γV(s') − V(s)]." }, { "id": 37, "questionText": "Which approach requires full episodes to update values?", "options": [ "TD learning", "Monte Carlo", "SARSA", "Q-learning" ], "correctAnswerIndex": 1, "explanation": "Monte Carlo estimates values based on actual returns from complete episodes." }, { "id": 38, "questionText": "Reward shaping is beneficial because it:", "options": [ "Eliminates exploration", "Guarantees deterministic policy", "Removes the discount factor", "Speeds up learning by giving intermediate rewards" ], "correctAnswerIndex": 3, "explanation": "Reward shaping provides guidance to the agent via extra signals." 
}, { "id": 39, "questionText": "Which of these is a disadvantage of sparse rewards?", "options": [ "Reward scaling issues", "Immediate overfitting", "Slower convergence and learning difficulty", "Exploration elimination" ], "correctAnswerIndex": 2, "explanation": "Sparse rewards provide limited feedback, making learning slower and exploration harder." }, { "id": 40, "questionText": "Which RL method learns directly from Q-values without policy?", "options": [ "Monte Carlo policy evaluation", "Value-based methods (e.g., Q-learning)", "Actor-Critic", "Policy gradient" ], "correctAnswerIndex": 1, "explanation": "Value-based methods estimate Q-values and derive actions via max(Q) instead of learning policy directly." }, { "id": 41, "questionText": "The TD error δ = r + γV(s') − V(s) is used to:", "options": [ "Update value estimates incrementally", "Determine next action", "Select best policy directly", "Compute discount factor" ], "correctAnswerIndex": 0, "explanation": "TD error measures prediction discrepancy to adjust value function gradually." }, { "id": 42, "questionText": "Why is Q*(s,a) considered optimal?", "options": [ "It gives immediate reward", "It ignores state transitions", "It represents maximum expected reward achievable by any policy", "It is randomly assigned" ], "correctAnswerIndex": 2, "explanation": "Q* provides the best action-value estimates regardless of current policy." }, { "id": 43, "questionText": "Which concept allows estimating future rewards without waiting for episode completion?", "options": [ "Reward clipping", "Monte Carlo", "Sparse reward", "Bootstrapping (TD learning)" ], "correctAnswerIndex": 3, "explanation": "Bootstrapping updates values using estimates of next state instead of waiting for full episode." }, { "id": 44, "questionText": "A discount factor γ close to 0 leads to:", "options": [ "Far-sighted agent", "Infinite reward accumulation", "Short-sighted agent focusing on immediate rewards", "Random action selection" ], "correctAnswerIndex": 2, "explanation": "Low γ reduces the weight of future rewards in value estimates." }, { "id": 45, "questionText": "A discount factor γ close to 1 leads to:", "options": [ "Far-sighted agent valuing future rewards", "No learning", "Randomized reward", "Immediate reward focus" ], "correctAnswerIndex": 0, "explanation": "High γ makes the agent long-term focused, considering distant rewards." }, { "id": 46, "questionText": "Which function guides agent behavior by evaluating future reward potential?", "options": [ "Reward function only", "State-action mapping", "Value function", "Transition function" ], "correctAnswerIndex": 2, "explanation": "Value functions estimate future reward potential, indirectly guiding actions." }, { "id": 47, "questionText": "Which method combines state and action evaluation to choose optimal moves?", "options": [ "TD(0) only", "Q-function", "V-function", "Monte Carlo only" ], "correctAnswerIndex": 1, "explanation": "Q(s,a) evaluates expected return for state-action pairs." }, { "id": 48, "questionText": "Which term measures the quality of an action in a state?", "options": [ "γ", "Reward shaping", "Q-value", "V-value" ], "correctAnswerIndex": 2, "explanation": "Q-value estimates long-term expected reward for taking a specific action." 
}, { "id": 49, "questionText": "Value function approximation is necessary when:", "options": [ "Actions are discrete", "State space is small", "Rewards are deterministic", "State space is too large or continuous" ], "correctAnswerIndex": 3, "explanation": "Large or continuous state spaces make tabular value storage impractical." }, { "id": 50, "questionText": "Which method learns policy indirectly via value estimates?", "options": [ "Actor-Critic only", "Value-based RL", "Monte Carlo only", "Policy gradient" ], "correctAnswerIndex": 1, "explanation": "Value-based methods choose actions via max(Q) without learning policy parameters directly." }, { "id": 51, "questionText": "In a deterministic environment, TD(0) converges to:", "options": [ "Immediate rewards only", "Random values", "True state values V(s)", "Policy parameters" ], "correctAnswerIndex": 2, "explanation": "TD(0) converges to correct V(s) if learning rate and exploration conditions are met." }, { "id": 52, "questionText": "Bootstrapping can introduce bias but reduces:", "options": [ "Variance in estimates", "Immediate rewards", "Policy randomness", "Learning rate" ], "correctAnswerIndex": 0, "explanation": "TD bootstrapping reduces variance at the cost of some bias." }, { "id": 53, "questionText": "The max operator in Q-learning helps:", "options": [ "Compute TD error only", "Discount rewards", "Randomize exploration", "Choose action with highest estimated return" ], "correctAnswerIndex": 3, "explanation": "max_a Q(s’,a) selects the action with highest expected value for next state." }, { "id": 54, "questionText": "Q-learning is considered off-policy because:", "options": [ "It learns optimal Q regardless of agent’s current actions", "It uses random rewards only", "It ignores state transitions", "It directly follows current policy" ], "correctAnswerIndex": 0, "explanation": "Off-policy learning allows learning of Q* while following exploratory policy." }, { "id": 55, "questionText": "Which function provides guidance for immediate action selection?", "options": [ "V(s)", "Reward function", "Discount factor", "Q(s,a)" ], "correctAnswerIndex": 3, "explanation": "Q-values indicate which action in current state yields highest expected reward." }, { "id": 56, "questionText": "Monte Carlo updates are unbiased but have:", "options": [ "High variance", "Immediate convergence", "No error", "Low variance" ], "correctAnswerIndex": 0, "explanation": "Monte Carlo estimates can vary widely between episodes, leading to high variance." }, { "id": 57, "questionText": "Which value function is used in policy iteration to evaluate policy?", "options": [ "Q*(s,a)", "R(s)", "Vπ(s)", "V*(s)" ], "correctAnswerIndex": 2, "explanation": "Policy evaluation uses Vπ(s) to estimate expected return under policy π." }, { "id": 58, "questionText": "Temporal difference methods combine Monte Carlo ideas and:", "options": [ "Policy gradients", "Reward clipping", "Bootstrapping", "Random exploration" ], "correctAnswerIndex": 2, "explanation": "TD methods use bootstrapping to estimate value based on next state’s current value." }, { "id": 59, "questionText": "Sparse rewards make RL more challenging because:", "options": [ "Policy gradient fails", "Agent receives little guidance during learning", "Discount factor becomes irrelevant", "Agent converges immediately" ], "correctAnswerIndex": 1, "explanation": "Without frequent feedback, the agent struggles to learn correct action-value mapping." 
}, { "id": 60, "questionText": "Which term describes expected future reward from a state-action pair?", "options": [ "V(s)", "R(s)", "Q(s,a)", "γ" ], "correctAnswerIndex": 2, "explanation": "Q(s,a) measures cumulative expected reward starting from that action." }, { "id": 61, "questionText": "Which method updates value functions continuously after every step?", "options": [ "Monte Carlo", "Reward shaping", "TD learning", "Policy gradient" ], "correctAnswerIndex": 2, "explanation": "TD learning updates V(s) incrementally using observed reward and next state value." }, { "id": 62, "questionText": "Which value function guides long-term planning in RL?", "options": [ "Policy entropy", "Reward only", "Immediate next state", "V(s) and Q(s,a)" ], "correctAnswerIndex": 3, "explanation": "V(s) and Q(s,a) provide estimates of cumulative future reward for planning actions." }, { "id": 63, "questionText": "Which is true about bootstrapped TD updates?", "options": [ "They are only for deterministic environments", "They ignore discount factor", "They reduce variance compared to Monte Carlo", "They eliminate reward function" ], "correctAnswerIndex": 2, "explanation": "Bootstrapping reduces variance but introduces bias, unlike full-episode Monte Carlo." }, { "id": 64, "questionText": "Which parameter determines learning step size in TD updates?", "options": [ "γ (discount factor)", "ε (exploration)", "α (learning rate)", "λ (trace decay)" ], "correctAnswerIndex": 2, "explanation": "α controls how much each update adjusts the current value estimate." }, { "id": 65, "questionText": "Which function represents optimal action-value function?", "options": [ "Vπ(s)", "Q*(s,a)", "R(s)", "V*(s)" ], "correctAnswerIndex": 1, "explanation": "Q*(s,a) gives the best achievable return for a state-action pair following optimal policy." }, { "id": 66, "questionText": "Which scenario illustrates reward hacking?", "options": [ "Agent stops learning", "Agent explores randomly", "Agent finds shortcut to maximize reward but violates task intention", "Agent follows optimal policy" ], "correctAnswerIndex": 2, "explanation": "Reward hacking occurs when agent exploits unintended loopholes in reward function." }, { "id": 67, "questionText": "Which function is used to derive greedy action selection?", "options": [ "Q(s,a)", "V(s)", "R(s)", "γ" ], "correctAnswerIndex": 0, "explanation": "Greedy selection picks action with maximum Q-value in current state." }, { "id": 68, "questionText": "Which parameter λ in TD(λ) balances:", "options": [ "Exploration vs exploitation", "Monte Carlo vs TD updates", "Immediate vs sparse reward", "Learning rate vs discount factor" ], "correctAnswerIndex": 1, "explanation": "λ mixes short-term TD updates with long-term Monte Carlo returns." }, { "id": 69, "questionText": "Why are value function approximators needed in large environments?", "options": [ "State space too large for tabular methods", "Discount factor irrelevant", "Policy gradients fail", "Rewards are deterministic" ], "correctAnswerIndex": 0, "explanation": "Function approximation allows generalization when storing values for every state is impossible." }, { "id": 70, "questionText": "Which function measures discrepancy between predicted and observed reward?", "options": [ "γ", "Q-value", "TD error δ", "V(s)" ], "correctAnswerIndex": 2, "explanation": "TD error δ = r + γV(s') − V(s) indicates prediction mismatch for updating values." }, { "id": 71, "questionText": "An agent consistently receives +1 reward only at goal completion. 
This is an example of:", "options": [ "Dense reward", "Negative reward", "Shaped reward", "Sparse reward" ], "correctAnswerIndex": 3, "explanation": "Sparse reward occurs when feedback is only given at task completion." }, { "id": 72, "questionText": "If Q(s,a) underestimates future rewards, the agent may:", "options": [ "Avoid valuable actions", "Ignore discount factor", "Converge instantly", "Overexplore" ], "correctAnswerIndex": 0, "explanation": "Underestimated Q-values mislead agent to ignore actions with high actual returns." }, { "id": 73, "questionText": "In episodic tasks, value function returns are calculated until:", "options": [ "First reward", "Episode ends", "Discount factor γ=0", "Next action" ], "correctAnswerIndex": 1, "explanation": "Episodic tasks compute total return from start state until terminal state." }, { "id": 74, "questionText": "Expected reward from a state following policy π is given by:", "options": [ "γ", "Vπ(s)", "R(s)", "Q*(s,a)" ], "correctAnswerIndex": 1, "explanation": "Vπ(s) = E[Σ γ^t r_t | s, π] is the formal definition." }, { "id": 75, "questionText": "Q-learning update formula is:", "options": [ "V(s) ← r only", "Policy π(s) ← π(s) + α", "Q(s,a) ← Q(s,a) + α[r + γ max Q(s’,a’) − Q(s,a)]", "TD error δ = r − V(s)" ], "correctAnswerIndex": 2, "explanation": "Q-learning uses max Q of next state to update current action value." }, { "id": 76, "questionText": "Which factor encourages exploration in value-based methods?", "options": [ "TD error δ", "ε-greedy policy", "Discount factor γ", "Learning rate α" ], "correctAnswerIndex": 1, "explanation": "ε-greedy policy selects random actions with small probability to explore new states." }, { "id": 77, "questionText": "Which method estimates Q(s,a) while following the same policy?", "options": [ "Monte Carlo", "SARSA (on-policy)", "TD(λ)", "Q-learning (off-policy)" ], "correctAnswerIndex": 1, "explanation": "SARSA uses next action chosen by current policy for updates." }, { "id": 78, "questionText": "Which technique combines immediate and future reward estimation in TD learning?", "options": [ "Monte Carlo only", "Bootstrapping", "Random policy", "Greedy selection" ], "correctAnswerIndex": 1, "explanation": "Bootstrapping blends observed reward with estimated next state value." }, { "id": 79, "questionText": "Which value function provides the highest possible expected return?", "options": [ "Immediate reward function R(s)", "Policy-specific Vπ(s)", "TD error δ", "Optimal value function V*(s)" ], "correctAnswerIndex": 3, "explanation": "V*(s) represents maximum expected cumulative reward from state s." }, { "id": 80, "questionText": "Reward shaping helps RL agent by:", "options": [ "Giving intermediate rewards to guide learning", "Eliminating exploration entirely", "Forcing deterministic actions", "Changing discount factor" ], "correctAnswerIndex": 0, "explanation": "Shaped rewards provide additional feedback to accelerate learning." }, { "id": 81, "questionText": "An agent in a maze receives +10 only when it reaches the exit, 0 otherwise. Which challenge does it face?", "options": [ "High variance in rewards", "Discount factor issues", "Immediate feedback overload", "Sparse rewards making learning slow" ], "correctAnswerIndex": 3, "explanation": "The agent gets feedback only at the goal, so intermediate steps provide no reward, slowing learning." }, { "id": 82, "questionText": "A delivery robot gets reward for each package delivered but penalty for hitting obstacles. 
How should reward shaping be applied?", "options": [ "Add small negative reward for each step to encourage faster delivery", "Ignore obstacle penalties", "Increase discount factor to 1", "Provide reward only at end" ], "correctAnswerIndex": 0, "explanation": "Adding small negative step reward incentivizes faster goal completion while maintaining obstacle penalties." }, { "id": 83, "questionText": "In a stock trading simulation, the agent receives reward only when selling stock at profit. What issue arises?", "options": [ "Overfitting to stock price", "Discount factor becomes negative", "Sparse delayed rewards can make learning inefficient", "Immediate feedback causes instability" ], "correctAnswerIndex": 2, "explanation": "Sparse and delayed reward makes it harder for the agent to learn which actions contributed to eventual profit." }, { "id": 84, "questionText": "An agent in a gridworld receives +1 for moving closer to the goal and -1 for moving away. This is an example of:", "options": [ "Shaped rewards", "Random rewards", "Sparse rewards", "Negative-only rewards" ], "correctAnswerIndex": 0, "explanation": "Reward shaping provides continuous guidance, encouraging progress toward the goal." }, { "id": 85, "questionText": "In a self-driving car simulation, if the agent only receives reward at destination, what would help learning?", "options": [ "Randomizing rewards", "Removing penalties", "Adding intermediate rewards for staying in lane and avoiding collisions", "Reducing discount factor to 0" ], "correctAnswerIndex": 2, "explanation": "Intermediate rewards guide agent step-by-step, improving learning efficiency." }, { "id": 86, "questionText": "A robot arm is learning to stack blocks. It receives reward only when the tower is complete. Which method helps?", "options": [ "Reward shaping with intermediate points for partial stacking", "Increase exploration to maximum", "Ignore intermediate failures", "Reduce learning rate" ], "correctAnswerIndex": 0, "explanation": "Providing partial rewards for successful sub-tasks speeds up learning in sparse reward settings." }, { "id": 87, "questionText": "In a scenario where the agent must navigate a dynamic environment with moving obstacles, which approach improves value estimation?", "options": [ "Monte Carlo only", "Ignore moving obstacles in rewards", "Random exploration without value update", "TD(λ) with bootstrapping for faster updates" ], "correctAnswerIndex": 3, "explanation": "TD(λ) allows combining short-term and long-term rewards for more efficient learning in dynamic environments." }, { "id": 88, "questionText": "A drone receives small negative reward for battery usage and positive reward for reaching checkpoints. What does this reward structure achieve?", "options": [ "Balances energy consumption and goal achievement", "Only optimizes immediate reward", "Encourages ignoring battery constraints", "Maximizes random exploration" ], "correctAnswerIndex": 0, "explanation": "The reward function encourages completing goals efficiently while minimizing energy use." }, { "id": 89, "questionText": "In a game, an agent finds a loophole to repeatedly collect small rewards instead of completing main quest. This is called:", "options": [ "Reward hacking", "TD error", "Sparse reward", "Bootstrapping" ], "correctAnswerIndex": 0, "explanation": "Reward hacking occurs when the agent exploits unintended reward sources instead of completing intended tasks." 
}, { "id": 90, "questionText": "An agent trained with high discount factor γ in a long-horizon task may:", "options": [ "Fail to explore", "Focus on long-term rewards, sometimes ignoring immediate gains", "Focus only on immediate reward", "Ignore reward function" ], "correctAnswerIndex": 1, "explanation": "High γ emphasizes future rewards, making the agent prioritize long-term outcomes." }, { "id": 91, "questionText": "In a simulation where an agent has multiple goals with different rewards, what is crucial for learning correct value estimates?", "options": [ "Properly scaling rewards to reflect relative importance", "Randomizing reward signals", "Ignoring discount factor", "Using immediate reward only" ], "correctAnswerIndex": 0, "explanation": "Scaling rewards ensures that high-priority goals dominate learning without distorting overall behavior." }, { "id": 92, "questionText": "If an agent receives stochastic rewards from the same action, value estimation must account for:", "options": [ "Ignoring stochasticity", "Expected value and variance", "TD error δ=0", "Immediate reward only" ], "correctAnswerIndex": 1, "explanation": "Stochastic rewards require estimating expected return and possibly managing variance to stabilize learning." }, { "id": 93, "questionText": "In multi-step tasks, an agent that overestimates future rewards may:", "options": [ "Choose risky actions expecting high payoff", "Always follow short-term reward", "Ignore environment", "Fail to update value functions" ], "correctAnswerIndex": 0, "explanation": "Overestimation in Q-values can lead to overly optimistic and risky behavior." }, { "id": 94, "questionText": "Which method helps reduce high variance in Monte Carlo returns for episodic tasks?", "options": [ "Sparse reward only", "TD bootstrapping", "Increase learning rate", "Ignore intermediate rewards" ], "correctAnswerIndex": 1, "explanation": "TD bootstrapping uses estimates from next state, reducing variance compared to full-episode returns." }, { "id": 95, "questionText": "A self-learning agent plays a competitive game. It wins small points frequently but big points only on rare strategies. How should rewards be structured?", "options": [ "Give only big rewards at game end", "Randomize reward assignment", "Remove small rewards entirely", "Balance frequent small rewards and rare big rewards to guide strategy" ], "correctAnswerIndex": 3, "explanation": "Balanced reward shaping ensures agent explores both common and rare valuable strategies." }, { "id": 96, "questionText": "Agent operates in continuous state space where exact Q-values cannot be stored. Which approach is needed?", "options": [ "Monte Carlo with tables", "Tabular Q-learning", "Function approximation (e.g., neural networks)", "Ignore approximation and use TD only" ], "correctAnswerIndex": 2, "explanation": "Continuous spaces require approximating value functions to generalize across states." }, { "id": 97, "questionText": "During training, the agent finds a shortcut to maximize reward but violates intended task. To fix this:", "options": [ "Redesign reward function to reflect intended goals", "Reduce discount factor to 0", "Increase exploration only", "Remove all negative rewards" ], "correctAnswerIndex": 0, "explanation": "Proper reward design prevents reward hacking and aligns learning with intended objectives." }, { "id": 98, "questionText": "An agent receives conflicting rewards for two simultaneous objectives. 
How should value estimates be handled?", "options": [ "Use weighted combination of rewards for single value estimate", "Ignore one objective", "Use random selection", "Reduce discount factor to 0" ], "correctAnswerIndex": 0, "explanation": "Weighted sum ensures both objectives influence learning appropriately." }, { "id": 99, "questionText": "In a delayed reward task, which technique accelerates learning?", "options": [ "Reward shaping with intermediate milestones", "Reducing learning rate", "Ignoring discount factor", "Random action selection only" ], "correctAnswerIndex": 0, "explanation": "Providing intermediate rewards guides agent through long sequences to the final goal." }, { "id": 100, "questionText": "A reinforcement learning agent in an environment with stochastic transitions and rewards can improve value estimation using:", "options": [ "TD learning with averaging or function approximation", "Immediate reward only", "Ignoring stochasticity", "Random actions without learning" ], "correctAnswerIndex": 0, "explanation": "TD methods combined with averaging or function approximation help stabilize learning in stochastic environments." } ] }