Spaces:

thecollabagepatch
/

magenta-retry

Running

App Files Community

thecollabagepatch committed on Aug 27

Commit

7fe8be5

1 Parent(s): 4bdf506

ok reverting one more time

Browse files

Files changed (2) hide show

jam_worker.py +355 -404
utils.py +36 -62

jam_worker.py CHANGED Viewed

@@ -1,5 +1,5 @@
-# jam_worker.py - COMPREHENSIVE REWRITE FOR PRECISE TIMING
-import threading, time, base64, io, uuid, math
 from dataclasses import dataclass, field
 import numpy as np
 import soundfile as sf
@@ -8,7 +8,7 @@ from threading import RLock
 from utils import (
     match_loudness_to_reference, stitch_generated, hard_trim_seconds,
     apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
-    resample_and_snap, wav_bytes_base64, StreamingResampler
 )
 @dataclass
@@ -32,34 +32,6 @@ class JamChunk:
     audio_base64: str
     metadata: dict
-@dataclass
-class TimingState:
-    """Precise timing state tracking"""
-    # Fractional bar position (never rounded until final emission)
-    emit_position_bars: float = 0.0
-    # Sample-accurate positions in the stream
-    stream_position_samples: int = 0
-    # Accumulated timing error for correction
-    fractional_error_bars: float = 0.0
-    # Codec frame timing
-    frames_per_bar: float = 0.0
-    samples_per_bar: float = 0.0
-    def advance_by_bars(self, bars: float):
-        """Advance timing by exact fractional bars"""
-        self.emit_position_bars += bars
-        self.fractional_error_bars += bars - int(bars)
-        # Correct for accumulated error when it gets significant
-        if abs(self.fractional_error_bars) > 0.5:
-            correction = int(round(self.fractional_error_bars))
-            self.fractional_error_bars -= correction
-            return correction  # bars to skip/rewind
-        return 0
 class JamWorker(threading.Thread):
     def __init__(self, mrt, params: JamParams):
         super().__init__(daemon=True)
@@ -67,32 +39,9 @@ class JamWorker(threading.Thread):
         self.params = params
         self.state = mrt.init_state()
-        # Core timing calculations (keep as floats for precision)
-        self._codec_fps = float(self.mrt.codec.frame_rate)  # 25.0
-        self._model_sr = int(self.mrt.sample_rate)          # 48000
-        self._target_sr = int(params.target_sr)
-        # Critical: these stay as floats to preserve fractional precision
-        self._seconds_per_bar = float(params.beats_per_bar * 60.0 / params.bpm)
-        self._frames_per_bar = self._seconds_per_bar * self._codec_fps
-        self._samples_per_bar_model = self._seconds_per_bar * self._model_sr
-        self._samples_per_bar_target = self._seconds_per_bar * self._target_sr
-        # Timing state
-        self._timing = TimingState(
-            frames_per_bar=self._frames_per_bar,
-            samples_per_bar=self._samples_per_bar_model
-        )
-        # Warn about problematic BPMs
-        frame_error = abs(self._frames_per_bar - round(self._frames_per_bar))
-        if frame_error > 0.01:
-            print(f"⚠️ Warning: {params.bpm} BPM creates {frame_error:.3f} frame drift per bar")
-            print(f"   This may cause gradual timing drift in long jams")
-        # Synchronization + placeholders
         self._lock = threading.Lock()
-        self._original_context_tokens = None
         if params.combined_loop is not None:
             self._setup_context_from_combined_loop()
@@ -101,39 +50,28 @@ class JamWorker(threading.Thread):
         self.outbox: list[JamChunk] = []
         self._stop_event = threading.Event()
-        # Stream state
         self._stream = None
-        self._stream_write_pos = 0  # Where we append new model output
-        # Delivery tracking
         self._last_delivered_index = 0
         self._max_buffer_ahead = 5
-        # Streaming resampler for precise SR conversion
-        self._resampler = None
-        if self._target_sr != self._model_sr:
-            self._resampler = StreamingResampler(
-                in_sr=self._model_sr,
-                out_sr=self._target_sr,
-                channels=2,
-                quality="VHQ"
-            )
         # Timing info
         self.last_chunk_started_at = None
         self.last_chunk_completed_at = None
-        # Control flags
-        self._pending_reseed = None
-        self._needs_bar_realign = False
-        self._reseed_ref_loop = None
     def _setup_context_from_combined_loop(self):
         """Set up MRT context tokens from the combined loop audio"""
         try:
             from utils import make_bar_aligned_context, take_bar_aligned_tail
-            codec_fps = self._codec_fps
             ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
             loop_for_context = take_bar_aligned_tail(
@@ -146,381 +84,452 @@ class JamWorker(threading.Thread):
             tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
             tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
-            # Use enhanced context alignment for fractional BPMs
             context_tokens = make_bar_aligned_context(
                 tokens,
                 bpm=self.params.bpm,
-                fps=self._codec_fps,
                 ctx_frames=self.mrt.config.context_length_frames,
-                beats_per_bar=self.params.beats_per_bar,
-                precise_timing=True  # Use new precise mode
             )
             self.state.context_tokens = context_tokens
-            print(f"Context setup: {context_tokens.shape[0]} frames, {self._frames_per_bar:.3f} frames/bar")
-            # Store original context for splice reseeding
             with self._lock:
                 if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
-                    self._original_context_tokens = np.copy(context_tokens)
         except Exception as e:
-            print(f"Failed to setup context from combined loop: {e}")
     def stop(self):
         self._stop_event.set()
     def update_knobs(self, *, guidance_weight=None, temperature=None, topk=None):
         with self._lock:
-            if guidance_weight is not None:
-                self.params.guidance_weight = float(guidance_weight)
-            if temperature is not None:
-                self.params.temperature = float(temperature)
-            if topk is not None:
-                self.params.topk = int(topk)
     def get_next_chunk(self) -> JamChunk | None:
         """Get the next sequential chunk (blocks/waits if not ready)"""
         target_index = self._last_delivered_index + 1
-        max_wait = 30.0
         start_time = time.time()
         while time.time() - start_time < max_wait and not self._stop_event.is_set():
             with self._lock:
                 for chunk in self.outbox:
                     if chunk.index == target_index:
                         self._last_delivered_index = target_index
-                        print(f"Delivered chunk {target_index} (bars {chunk.metadata.get('bar_range', 'unknown')})")
                         return chunk
             time.sleep(0.1)
         return None
     def mark_chunk_consumed(self, chunk_index: int):
         """Mark a chunk as consumed by the frontend"""
         with self._lock:
             self._last_delivered_index = max(self._last_delivered_index, chunk_index)
     def _should_generate_next_chunk(self) -> bool:
-        """Check if we should generate the next chunk"""
         with self._lock:
-            return self.idx <= self._last_delivered_index + self._max_buffer_ahead
-    def _get_precise_chunk_samples(self, bars: float) -> int:
-        """Get exact sample count for fractional bars at model SR"""
-        exact_seconds = bars * self._seconds_per_bar
-        return int(round(exact_seconds * self._model_sr))
     def _append_model_chunk_to_stream(self, wav):
-        """Append model output to continuous stream with crossfading"""
         xfade_s = float(self.mrt.config.crossfade_length)
-        sr = self._model_sr
         xfade_n = int(round(xfade_s * sr))
         s = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]
-        if self._stream is None:
-            # First chunk: drop model pre-roll
             if s.shape[0] > xfade_n:
                 self._stream = s[xfade_n:].astype(np.float32, copy=True)
             else:
                 self._stream = np.zeros((0, s.shape[1]), dtype=np.float32)
-            self._stream_write_pos = self._stream.shape[0]
             return
-        # Crossfade with equal-power curves
         if s.shape[0] <= xfade_n or self._stream.shape[0] < xfade_n:
-            # Degenerate case
             self._stream = np.concatenate([self._stream, s], axis=0)
-            self._stream_write_pos = self._stream.shape[0]
             return
-        # Standard crossfade
         tail = self._stream[-xfade_n:]
         head = s[:xfade_n]
         t = np.linspace(0, np.pi/2, xfade_n, endpoint=False, dtype=np.float32)[:, None]
         eq_in, eq_out = np.sin(t), np.cos(t)
         mixed = tail * eq_out + head * eq_in
         self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
-        self._stream_write_pos = self._stream.shape[0]
-    def _extract_precise_chunk(self, start_bars: float, chunk_bars: float) -> np.ndarray:
-        """Extract exactly chunk_bars worth of audio starting at start_bars"""
-        start_samples = self._get_precise_chunk_samples(start_bars)
-        chunk_samples = self._get_precise_chunk_samples(chunk_bars)
-        end_samples = start_samples + chunk_samples
-        if end_samples > self._stream.shape[0]:
-            return None  # Not enough audio generated yet
-        return self._stream[start_samples:end_samples]
-    def _perform_onset_alignment(self, ref_loop: au.Waveform) -> float:
-        """Estimate timing offset between generated audio and reference"""
-        if self._stream is None or self._stream.shape[0] < self._model_sr:
-            return 0.0
-        try:
-            # Take first ~2 seconds of generated audio
-            gen_samples = min(int(2.0 * self._model_sr), self._stream.shape[0])
-            gen_head = au.Waveform(
-                self._stream[:gen_samples].astype(np.float32, copy=False),
-                self._model_sr
-            ).as_stereo()
-            # Reference: last bar of the loop
-            ref_samples = int(self._seconds_per_bar * ref_loop.sample_rate)
-            if ref_loop.samples.shape[0] >= ref_samples:
-                ref_tail = au.Waveform(
-                    ref_loop.samples[-ref_samples:],
-                    ref_loop.sample_rate
-                ).resample(self._model_sr).as_stereo()
-            else:
-                ref_tail = ref_loop.resample(self._model_sr).as_stereo()
-            # Cross-correlation based alignment
-            def envelope(x, sr):
-                if x.ndim == 2:
-                    x = x.mean(axis=1)
-                x = np.abs(x).astype(np.float32)
-                # Simple smoothing
-                win = max(1, int(0.01 * sr))  # 10ms window
-                if win > 1:
-                    kernel = np.ones(win) / win
-                    x = np.convolve(x, kernel, mode='same')
-                return x
-            env_ref = envelope(ref_tail.samples, self._model_sr)
-            env_gen = envelope(gen_head.samples, self._model_sr)
-            # Limit search range to reasonable offset
-            max_offset_samples = int(0.2 * self._model_sr)  # 200ms max
-            # Normalize for correlation
-            env_ref = (env_ref - env_ref.mean()) / (env_ref.std() + 1e-8)
-            env_gen = (env_gen - env_gen.mean()) / (env_gen.std() + 1e-8)
-            # Find best correlation
-            best_offset = 0
-            best_corr = -1.0
-            search_len = min(len(env_ref), len(env_gen) - max_offset_samples)
-            if search_len > 0:
-                for offset in range(0, max_offset_samples, 4):  # subsample for speed
-                    if offset + search_len >= len(env_gen):
-                        break
-                    corr = np.corrcoef(env_ref[:search_len], env_gen[offset:offset+search_len])[0,1]
-                    if not np.isnan(corr) and corr > best_corr:
-                        best_corr = corr
-                        best_offset = offset
-            offset_seconds = best_offset / self._model_sr
-            print(f"Onset alignment: {offset_seconds:.3f}s offset (correlation: {best_corr:.3f})")
-            return offset_seconds
-        except Exception as e:
-            print(f"Onset alignment failed: {e}")
-            return 0.0
-    def _align_to_bar_boundary(self):
-        """Align timing state to next bar boundary"""
-        current_bar = self._timing.emit_position_bars
-        next_bar = math.ceil(current_bar)
-        if abs(next_bar - current_bar) > 1e-6:
-            skip_bars = next_bar - current_bar
-            skip_samples = self._get_precise_chunk_samples(skip_bars)
-            self._timing.stream_position_samples += skip_samples
-            self._timing.emit_position_bars = next_bar
-            print(f"Aligned to bar {next_bar:.0f}, skipped {skip_bars:.4f} bars")
     def reseed_from_waveform(self, wav):
-        """Full context replacement reseed"""
         new_state = self.mrt.init_state()
-        # Build new context from waveform
-        codec_fps = self._codec_fps
         ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
         tail = take_bar_aligned_tail(wav, self.params.bpm, self.params.beats_per_bar, ctx_seconds)
         tokens_full = self.mrt.codec.encode(tail).astype(np.int32)
         tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
-        context_tokens = make_bar_aligned_context(
-            tokens,
-            bpm=self.params.bpm,
-            fps=self._codec_fps,
             ctx_frames=self.mrt.config.context_length_frames,
-            beats_per_bar=self.params.beats_per_bar,
-            precise_timing=True
         )
         new_state.context_tokens = context_tokens
         self.state = new_state
-        # Reset stream
-        self._stream = None
-        self._stream_write_pos = 0
-        self._timing = TimingState(
-            frames_per_bar=self._frames_per_bar,
-            samples_per_bar=self._samples_per_bar_model
         )
         self._needs_bar_realign = True
-        self._reseed_ref_loop = wav
     def reseed_splice(self, recent_wav, anchor_bars: float):
-        """Token-splice reseed"""
         with self._lock:
             if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
                 self._original_context_tokens = np.copy(self.state.context_tokens)
-            # Build new context via splicing
-            recent_tokens = self._make_recent_tokens_from_wave(recent_wav)
             new_ctx = self._splice_context(self._original_context_tokens, recent_tokens, anchor_bars)
             self._pending_reseed = {"ctx": new_ctx, "ref": recent_wav}
-            # Install immediately
             new_state = self.mrt.init_state()
             new_state.context_tokens = new_ctx
             self.state = new_state
-            # Reset stream state
-            self._stream = None
-            self._stream_write_pos = 0
-            self._timing = TimingState(
-                frames_per_bar=self._frames_per_bar,
-                samples_per_bar=self._samples_per_bar_model
-            )
-            self._needs_bar_realign = True
-    def _make_recent_tokens_from_wave(self, wav) -> np.ndarray:
-        """Encode waveform to context tokens with precise alignment"""
-        tokens_full = self.mrt.codec.encode(wav).astype(np.int32)
-        tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
-        context_tokens = make_bar_aligned_context(
-            tokens,
-            bpm=self.params.bpm,
-            fps=self._codec_fps,
-            ctx_frames=self.mrt.config.context_length_frames,
-            beats_per_bar=self.params.beats_per_bar,
-            precise_timing=True
-        )
-        return context_tokens
-    def _splice_context(self, original_tokens: np.ndarray, recent_tokens: np.ndarray, anchor_bars: float) -> np.ndarray:
-        """Enhanced context splicing with fractional bar handling"""
-        ctx_frames = int(self.mrt.config.context_length_frames)
-        # Convert anchor bars to codec frames (keep fractional precision)
-        anchor_frames_f = anchor_bars * self._frames_per_bar
-        anchor_frames = int(round(anchor_frames_f))
-        # Take anchor from original
-        anchor = original_tokens[-anchor_frames:] if anchor_frames <= original_tokens.shape[0] else original_tokens
-        # Fill remainder with recent tokens
-        remain_frames = ctx_frames - anchor.shape[0]
-        if remain_frames > 0:
-            recent = recent_tokens[-remain_frames:] if remain_frames <= recent_tokens.shape[0] else recent_tokens
-        else:
-            recent = recent_tokens[:0]  # empty
-        # Combine
-        if anchor.size > 0 and recent.size > 0:
-            spliced = np.concatenate([recent, anchor], axis=0)
-        elif anchor.size > 0:
-            spliced = anchor
-        else:
-            spliced = recent_tokens[-ctx_frames:]
-        # Ensure exact length
-        if spliced.shape[0] > ctx_frames:
-            spliced = spliced[-ctx_frames:]
-        elif spliced.shape[0] < ctx_frames:
-            # Tile to fill
-            reps = int(np.ceil(ctx_frames / max(1, spliced.shape[0])))
-            spliced = np.tile(spliced, (reps, 1))[-ctx_frames:]
-        return spliced
     def run(self):
-        """Main generation loop with precise timing"""
-        chunk_bars = float(self.params.bars_per_chunk)
-        chunk_samples = self._get_precise_chunk_samples(chunk_bars)
-        xfade_s = float(self.mrt.config.crossfade_length)
-        def _samples_needed(first_chunk_extra=False):
-            """Calculate samples needed in stream for next emission"""
-            available = 0 if self._stream is None else (
-                self._stream.shape[0] - self._timing.stream_position_samples
-            )
-            required = chunk_samples
             if first_chunk_extra:
-                # Extra material for onset alignment
-                extra_samples = self._get_precise_chunk_samples(2.0)
-                required += extra_samples
-            return max(0, required - available)
-        print(f"JamWorker started: {self.params.bpm} BPM, {self._frames_per_bar:.3f} frames/bar, {chunk_bars} bars/chunk")
         while not self._stop_event.is_set():
             if not self._should_generate_next_chunk():
                 time.sleep(0.25)
                 continue
-            # 1) Generate until we have enough audio
-            needed = _samples_needed(first_chunk_extra=(self.idx == 0))
-            while needed > 0 and not self._stop_event.is_set():
                 with self._lock:
                     style_vec = self.params.style_vec
                     self.mrt.guidance_weight = float(self.params.guidance_weight)
-                    self.mrt.temperature = float(self.params.temperature)
-                    self.mrt.topk = int(self.params.topk)
                 wav, self.state = self.mrt.generate_chunk(state=self.state, style=style_vec)
-                self._append_model_chunk_to_stream(wav)
-                needed = _samples_needed(first_chunk_extra=(self.idx == 0))
             if self._stop_event.is_set():
                 break
-            # 2) First chunk: perform onset alignment
             if (self.idx == 0 and self.params.combined_loop is not None) or self._needs_bar_realign:
                 ref_loop = self._reseed_ref_loop or self.params.combined_loop
                 if ref_loop is not None:
-                    offset_seconds = self._perform_onset_alignment(ref_loop)
-                    if abs(offset_seconds) > 0.01:  # More than 10ms
-                        offset_samples = int(round(offset_seconds * self._model_sr))
-                        self._timing.stream_position_samples = max(0, offset_samples)
-                        print(f"Applied onset offset: {offset_seconds:.3f}s")
-                self._align_to_bar_boundary()
                 self._needs_bar_realign = False
                 self._reseed_ref_loop = None
-            # 3) Extract precise chunk
-            chunk_start_bars = self._timing.emit_position_bars
-            slice_audio = self._extract_precise_chunk(chunk_start_bars, chunk_bars)
-            if slice_audio is None:
-                continue  # Need more generation
-            # Update timing state
-            correction = self._timing.advance_by_bars(chunk_bars)
-            if correction != 0:
-                print(f"Applied {correction} bar timing correction")
-            self._timing.stream_position_samples += chunk_samples
-            # 4) Create waveform and process
-            y = au.Waveform(slice_audio.astype(np.float32, copy=False), self._model_sr).as_stereo()
-            # Loudness matching and fades
             if self.idx == 0 and self.params.ref_loop is not None:
                 y, _ = match_loudness_to_reference(
                     self.params.ref_loop, y,
@@ -530,96 +539,38 @@ class JamWorker(threading.Thread):
             else:
                 apply_micro_fades(y, 3)
-            # 5) Sample rate conversion
-            if self._resampler is not None:
-                # Use streaming resampler for precise conversion
-                resampled = self._resampler.process(y.samples, final=False)
-                # Ensure exact target length
-                target_samples = int(round(chunk_bars * self._samples_per_bar_target))
-                if resampled.shape[0] != target_samples:
-                    if resampled.shape[0] < target_samples:
-                        pad_samples = target_samples - resampled.shape[0]
-                        pad = np.zeros((pad_samples, resampled.shape[1]), dtype=resampled.dtype)
-                        resampled = np.vstack([resampled, pad])
-                    else:
-                        resampled = resampled[:target_samples]
-                final_audio = resampled
-                final_sr = self._target_sr
-            else:
-                # No resampling needed
-                final_audio = y.samples
-                final_sr = self._model_sr
-            # 6) Encode to base64
-            b64, total_samples, channels = wav_bytes_base64(final_audio, final_sr)
-            # 7) Create metadata with timing info
-            actual_duration = total_samples / final_sr
-            bar_range = f"{chunk_start_bars:.2f}-{self._timing.emit_position_bars:.2f}"
-            meta = {
-                "bpm": int(round(self.params.bpm)),
-                "bars": int(self.params.bars_per_chunk),
-                "beats_per_bar": int(self.params.beats_per_bar),
-                "sample_rate": int(final_sr),
-                "channels": int(channels),
-                "total_samples": int(total_samples),
-                "seconds_per_bar": self._seconds_per_bar,
-                "loop_duration_seconds": actual_duration,
-                "bar_range": bar_range,
-                "timing_state": {
-                    "emit_position_bars": self._timing.emit_position_bars,
-                    "frames_per_bar": self._frames_per_bar,
-                    "fractional_error": self._timing.fractional_error_bars,
-                },
-                "xfade_seconds": xfade_s,
-                "guidance_weight": self.params.guidance_weight,
-                "temperature": self.params.temperature,
-                "topk": self.params.topk,
-            }
-            # 8) Publish chunk
             with self._lock:
                 self.idx += 1
-                chunk = JamChunk(index=self.idx, audio_base64=b64, metadata=meta)
-                self.outbox.append(chunk)
-                # Cleanup old chunks
                 if len(self.outbox) > 10:
                     cutoff = self._last_delivered_index - 5
                     self.outbox = [ch for ch in self.outbox if ch.index > cutoff]
-                # Handle pending reseeds
                 if self._pending_reseed is not None:
                     pkg = self._pending_reseed
                     self._pending_reseed = None
                     new_state = self.mrt.init_state()
-                    new_state.context_tokens = pkg["ctx"]
                     self.state = new_state
-                    # Reset timing and stream
                     self._stream = None
-                    self._stream_write_pos = 0
-                    self._timing = TimingState(
-                        frames_per_bar=self._frames_per_bar,
-                        samples_per_bar=self._samples_per_bar_model
-                    )
-                    self._reseed_ref_loop = pkg.get("ref")
                     self._needs_bar_realign = True
-                    print("Reseed applied at bar boundary")
-            drift_ms = abs(self._timing.fractional_error_bars) * self._seconds_per_bar * 1000
-            print(f"Completed chunk {self.idx} ({bar_range} bars, {drift_ms:.1f}ms drift)")
-        print("JamWorker stopped")
-        # Clean up resampler
-        if self._resampler is not None:
-            try:
-                self._resampler.flush()
-            except:
-                pass

+# jam_worker.py - SIMPLE FIX VERSION
+import threading, time, base64, io, uuid
 from dataclasses import dataclass, field
 import numpy as np
 import soundfile as sf
 from utils import (
     match_loudness_to_reference, stitch_generated, hard_trim_seconds,
     apply_micro_fades, make_bar_aligned_context, take_bar_aligned_tail,
+    resample_and_snap, wav_bytes_base64
 )
 @dataclass
     audio_base64: str
     metadata: dict
 class JamWorker(threading.Thread):
     def __init__(self, mrt, params: JamParams):
         super().__init__(daemon=True)
         self.params = params
         self.state = mrt.init_state()
+        # ✅ init synchronization + placeholders FIRST
         self._lock = threading.Lock()
+        self._original_context_tokens = None   # so hasattr checks are cheap/clear
         if params.combined_loop is not None:
             self._setup_context_from_combined_loop()
         self.outbox: list[JamChunk] = []
         self._stop_event = threading.Event()
         self._stream = None
+        self._next_emit_start = 0
+        # NEW: Track delivery state
         self._last_delivered_index = 0
         self._max_buffer_ahead = 5
         # Timing info
         self.last_chunk_started_at = None
         self.last_chunk_completed_at = None
+        self._pending_reseed = None        # {"ctx": np.ndarray, "ref": au.Waveform|None}
+        self._needs_bar_realign = False    # request a one-shot downbeat alignment
+        self._reseed_ref_loop = None       # which loop to align against after reseed
     def _setup_context_from_combined_loop(self):
         """Set up MRT context tokens from the combined loop audio"""
         try:
             from utils import make_bar_aligned_context, take_bar_aligned_tail
+            codec_fps = float(self.mrt.codec.frame_rate)
             ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
             loop_for_context = take_bar_aligned_tail(
             tokens_full = self.mrt.codec.encode(loop_for_context).astype(np.int32)
             tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
             context_tokens = make_bar_aligned_context(
                 tokens,
                 bpm=self.params.bpm,
+                fps=float(self.mrt.codec.frame_rate),  # keep fractional fps
                 ctx_frames=self.mrt.config.context_length_frames,
+                beats_per_bar=self.params.beats_per_bar
             )
+            # Install fresh context
             self.state.context_tokens = context_tokens
+            print(f"✅ JamWorker: Set up fresh context from combined loop")
+            # NEW: keep a copy of the *original* context tokens for future splice-reseed
+            # (guard so we only set this once, at jam start)
             with self._lock:
                 if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
+                    self._original_context_tokens = np.copy(context_tokens)  # shape: [T, depth]
         except Exception as e:
+            print(f"❌ Failed to setup context from combined loop: {e}")
     def stop(self):
         self._stop_event.set()
     def update_knobs(self, *, guidance_weight=None, temperature=None, topk=None):
         with self._lock:
+            if guidance_weight is not None: self.params.guidance_weight = float(guidance_weight)
+            if temperature is not None:     self.params.temperature     = float(temperature)
+            if topk is not None:            self.params.topk            = int(topk)
     def get_next_chunk(self) -> JamChunk | None:
         """Get the next sequential chunk (blocks/waits if not ready)"""
         target_index = self._last_delivered_index + 1
+        # Wait for the target chunk to be ready (with timeout)
+        max_wait = 30.0  # seconds
         start_time = time.time()
         while time.time() - start_time < max_wait and not self._stop_event.is_set():
             with self._lock:
+                # Look for the exact chunk we need
                 for chunk in self.outbox:
                     if chunk.index == target_index:
                         self._last_delivered_index = target_index
+                        print(f"📦 Delivered chunk {target_index}")
                         return chunk
+            # Not ready yet, wait a bit
             time.sleep(0.1)
+        # Timeout or stopped
         return None
     def mark_chunk_consumed(self, chunk_index: int):
         """Mark a chunk as consumed by the frontend"""
         with self._lock:
             self._last_delivered_index = max(self._last_delivered_index, chunk_index)
+            print(f"✅ Chunk {chunk_index} consumed")
     def _should_generate_next_chunk(self) -> bool:
+        """Check if we should generate the next chunk (don't get too far ahead)"""
         with self._lock:
+            # Don't generate if we're already too far ahead
+            if self.idx > self._last_delivered_index + self._max_buffer_ahead:
+                return False
+            return True
+    def _seconds_per_bar(self) -> float:
+        return self.params.beats_per_bar * (60.0 / self.params.bpm)
+    def _snap_and_encode(self, y, seconds, target_sr, bars):
+        cur_sr = int(self.mrt.sample_rate)
+        x = y.samples if y.samples.ndim == 2 else y.samples[:, None]
+        x = resample_and_snap(x, cur_sr=cur_sr, target_sr=target_sr, seconds=seconds)
+        b64, total_samples, channels = wav_bytes_base64(x, target_sr)
+        meta = {
+            "bpm": int(round(self.params.bpm)),
+            "bars": int(bars),
+            "beats_per_bar": int(self.params.beats_per_bar),
+            "sample_rate": int(target_sr),
+            "channels": channels,
+            "total_samples": total_samples,
+            "seconds_per_bar": self._seconds_per_bar(),
+            "loop_duration_seconds": bars * self._seconds_per_bar(),
+            "guidance_weight": self.params.guidance_weight,
+            "temperature": self.params.temperature,
+            "topk": self.params.topk,
+        }
+        return b64, meta
     def _append_model_chunk_to_stream(self, wav):
+        """Append one model chunk to the persistent stream with an equal-power crossfade.
+        The first chunk initialises ``self._stream`` (dropping the first ``xfade_n``
+        samples) and resets ``self._next_emit_start`` to 0. Every later chunk has its
+        first ``xfade_n`` samples mixed with the last ``xfade_n`` samples of the stream.
+        Args:
+            wav: object exposing ``samples`` as a 1-D or 2-D array; 1-D input is
+                promoted to a single-column 2-D array.
+        """
         xfade_s = float(self.mrt.config.crossfade_length)
+        sr = int(self.mrt.sample_rate)
         xfade_n = int(round(xfade_s * sr))
         s = wav.samples if wav.samples.ndim == 2 else wav.samples[:, None]
+        if getattr(self, "_stream", None) is None:
+            # First chunk: drop model pre-roll (xfade head)
             if s.shape[0] > xfade_n:
                 self._stream = s[xfade_n:].astype(np.float32, copy=True)
             else:
                 self._stream = np.zeros((0, s.shape[1]), dtype=np.float32)
+            self._next_emit_start = 0  # pointer into _stream (model SR samples)
             return
+        # Plain append when there is nothing to crossfade. The xfade_n <= 0 case must be
+        # caught here: ``_stream[-0:]`` is the WHOLE stream and ``_stream[:-0]`` is empty,
+        # so the mix below would multiply mismatched shapes and discard the stream.
+        # The other two conditions cover chunks/streams shorter than the overlap.
+        if xfade_n <= 0 or s.shape[0] <= xfade_n or self._stream.shape[0] < xfade_n:
             self._stream = np.concatenate([self._stream, s], axis=0)
             return
+        # Crossfade last xfade_n samples of _stream with head of new s
         tail = self._stream[-xfade_n:]
         head = s[:xfade_n]
+        # Equal-power envelopes: sin/cos over a quarter period, broadcast across channels
         t = np.linspace(0, np.pi/2, xfade_n, endpoint=False, dtype=np.float32)[:, None]
         eq_in, eq_out = np.sin(t), np.cos(t)
         mixed = tail * eq_out + head * eq_in
         self._stream = np.concatenate([self._stream[:-xfade_n], mixed, s[xfade_n:]], axis=0)
     def reseed_from_waveform(self, wav):
+        # Replace the model state with a fresh one whose context tokens are encoded
+        # from the bar-aligned tail of `wav`, then reset the output stream.
+        # NOTE(review): unlike reseed_splice, this method does not take self._lock
+        # while swapping self.state -- confirm callers only use it when that is safe.
+        # 1) Re-init state
         new_state = self.mrt.init_state()
+        # 2) Build bar-aligned context tokens from provided audio
+        codec_fps   = float(self.mrt.codec.frame_rate)
         # Context window length in seconds = context frames / codec frames-per-second.
         ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
+        # Local import; the same two names are also imported from utils at module level.
+        from utils import take_bar_aligned_tail, make_bar_aligned_context
         tail = take_bar_aligned_tail(wav, self.params.bpm, self.params.beats_per_bar, ctx_seconds)
         tokens_full = self.mrt.codec.encode(tail).astype(np.int32)
         # Keep only the first decoder_codec_rvq_depth columns of the encoded tokens.
         tokens = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
+        context_tokens = make_bar_aligned_context(tokens,
+            bpm=self.params.bpm, fps=float(self.mrt.codec.frame_rate),
             ctx_frames=self.mrt.config.context_length_frames,
+            beats_per_bar=self.params.beats_per_bar
         )
         new_state.context_tokens = context_tokens
         self.state = new_state
+        # Drop the existing stream and flag a one-time bar realignment.
+        self._prepare_stream_for_reseed_handoff()
+    def _frames_per_bar(self) -> int:
+        # codec frame-rate (frames/s) -> frames per musical bar
+        # The product is rounded to the nearest integer, so a fractional
+        # frames-per-bar value is not preserved by this helper.
+        fps = float(self.mrt.codec.frame_rate)
+        sec_per_bar = (60.0 / float(self.params.bpm)) * float(self.params.beats_per_bar)
+        return int(round(fps * sec_per_bar))
+    def _ctx_frames(self) -> int:
+        # Number of codec frames in the model's conditioning (context) window,
+        # read from mrt.config.context_length_frames.
+        return int(self.mrt.config.context_length_frames)
+    def _make_recent_tokens_from_wave(self, wav) -> np.ndarray:
+        """
+        Encode waveform and produce a BAR-ALIGNED context token window.
+        The waveform is encoded with ``self.mrt.codec``, cut down to the first
+        ``decoder_codec_rvq_depth`` token columns, and passed to
+        ``make_bar_aligned_context`` with the current bpm / beats-per-bar and
+        ``ctx_frames=context_length_frames``.
+        """
+        tokens_full = self.mrt.codec.encode(wav).astype(np.int32)           # [T, rvq_total]
+        tokens      = tokens_full[:, :self.mrt.config.decoder_codec_rvq_depth]
+        # Local import; make_bar_aligned_context is also imported at module level.
+        from utils import make_bar_aligned_context
+        ctx = make_bar_aligned_context(
+            tokens,
+            bpm=self.params.bpm,
+            fps=float(self.mrt.codec.frame_rate),  # keep fractional fps
+            ctx_frames=self.mrt.config.context_length_frames,
+            beats_per_bar=self.params.beats_per_bar
         )
+        return ctx
+    def _bar_aligned_tail(self, tokens: np.ndarray, bars: float) -> np.ndarray:
+        """
+        Return the last ``round(bars)`` whole bars of ``tokens`` (rows = codec frames).
+        ``bars`` is rounded to the nearest integer and multiplied by the integer
+        ``_frames_per_bar()``. A non-positive result yields an empty slice; if
+        ``tokens`` has no more rows than requested, it is returned unchanged.
+        """
+        frames_per_bar = self._frames_per_bar()
+        # Requested tail length in frames, clamped at zero.
+        want = max(frames_per_bar * int(round(bars)), 0)
+        if want == 0:
+            return tokens[:0]  # empty
+        if tokens.shape[0] <= want:
+            return tokens
+        return tokens[-want:]
+    def _splice_context(self, original_tokens: np.ndarray, recent_tokens: np.ndarray,
+                    anchor_bars: float) -> np.ndarray:
+        # Build a context window of ``_ctx_frames()`` rows laid out as
+        # [anchor tail of original_tokens, tail of recent_tokens], then trimmed or
+        # left-filled to the target length and cut to original_tokens' column count.
+        #   original_tokens: token rows the anchor is taken from.
+        #   recent_tokens:   token rows used to fill the remainder.
+        #   anchor_bars:     bars of anchor to keep; floored to whole bars.
+        import math
+        ctx_frames = self._ctx_frames()
+        depth = original_tokens.shape[1]
+        frames_per_bar = self._frames_per_bar()
+        # 1) Anchor tail (whole bars)
+        anchor = self._bar_aligned_tail(original_tokens, math.floor(anchor_bars))
+        # 2) Fill remainder with recent (prefer whole bars)
+        a = anchor.shape[0]
+        remain = max(ctx_frames - a, 0)
+        recent = recent_tokens[:0]
+        used_recent = 0  # frames taken from the END of recent_tokens
+        if remain > 0:
+            bars_fit = remain // frames_per_bar
+            if bars_fit >= 1:
+                # At least one whole bar fits: take a whole number of bars.
+                want_recent_frames = int(bars_fit * frames_per_bar)
+                used_recent = min(want_recent_frames, recent_tokens.shape[0])
+                recent = recent_tokens[-used_recent:] if used_recent > 0 else recent_tokens[:0]
+            else:
+                # Less than a bar of room: take whatever fits.
+                used_recent = min(remain, recent_tokens.shape[0])
+                recent = recent_tokens[-used_recent:] if used_recent > 0 else recent_tokens[:0]
+        # 3) Concat in order [anchor, recent]
+        if anchor.size or recent.size:
+            out = np.concatenate([anchor, recent], axis=0)
+        else:
+            # fallback: just take the last ctx window from recent
+            out = recent_tokens[-ctx_frames:]
+        # 4) Trim if we overshot
+        if out.shape[0] > ctx_frames:
+            out = out[-ctx_frames:]
+        # 5) Reduce the length to a whole number of bars by dropping rows from the
+        #    START (the slice keeps the last max_bar_aligned rows, so the END is unchanged)
+        if frames_per_bar > 0:
+            max_bar_aligned = (out.shape[0] // frames_per_bar) * frames_per_bar
+        else:
+            max_bar_aligned = out.shape[0]
+        if max_bar_aligned > 0 and out.shape[0] != max_bar_aligned:
+            out = out[-max_bar_aligned:]
+        # 6) Left-fill to reach ctx_frames **without moving the END**
+        deficit = ctx_frames - out.shape[0]
+        if deficit > 0:
+            left_parts = []
+            # Prefer frames immediately BEFORE the region we used from 'recent_tokens'
+            if used_recent < recent_tokens.shape[0]:
+                take = min(deficit, recent_tokens.shape[0] - used_recent)
+                if used_recent > 0:
+                    left_parts.append(recent_tokens[-(used_recent + take) : -used_recent])
+                else:
+                    # Nothing was used yet, so the slice must not end at "-0".
+                    left_parts.append(recent_tokens[-take:])
+            # Then take frames immediately BEFORE the 'anchor' in original_tokens
+            if sum(p.shape[0] for p in left_parts) < deficit and anchor.shape[0] > 0:
+                need = deficit - sum(p.shape[0] for p in left_parts)
+                a_len = anchor.shape[0]
+                avail = max(original_tokens.shape[0] - a_len, 0)
+                take2 = min(need, avail)
+                if take2 > 0:
+                    left_parts.append(original_tokens[-(a_len + take2) : -a_len])
+            # Still short? tile from what's available
+            have = sum(p.shape[0] for p in left_parts)
+            if have < deficit:
+                base = out if out.shape[0] > 0 else (recent_tokens if recent_tokens.shape[0] > 0 else original_tokens)
+                reps = int(np.ceil((deficit - have) / max(1, base.shape[0])))
+                left_parts.append(np.tile(base, (reps, 1))[: (deficit - have)])
+            left = np.concatenate(left_parts, axis=0)
+            out = np.concatenate([left[-deficit:], out], axis=0)
+        # 7) Final guard to exact length
+        if out.shape[0] > ctx_frames:
+            out = out[-ctx_frames:]
+        elif out.shape[0] < ctx_frames:
+            # NOTE(review): if out is empty here, tiling still yields an empty array.
+            reps = int(np.ceil(ctx_frames / max(1, out.shape[0])))
+            out = np.tile(out, (reps, 1))[-ctx_frames:]
+        # 8) Depth guard
+        if out.shape[1] != depth:
+            out = out[:, :depth]
+        return out
+    def _realign_emit_pointer_to_bar(self, sr_model: int):
+        """Advance _next_emit_start to the next bar boundary in model-sample space.
+        The bar length is ``round(_seconds_per_bar() * sr_model)`` samples. A pointer
+        already on a multiple of that length is left unchanged; the pointer is only
+        ever moved forward.
+        """
+        bar_samps = int(round(self._seconds_per_bar() * sr_model))
+        # Guard against a zero/negative bar length (would make the modulo invalid).
+        if bar_samps <= 0:
+            return
+        phase = self._next_emit_start % bar_samps
+        if phase != 0:
+            self._next_emit_start += (bar_samps - phase)
+    def _prepare_stream_for_reseed_handoff(self):
+        # Reset streaming state after a reseed: discard the accumulated stream, rewind
+        # the emit pointer to 0, and flag that a bar realignment is needed.
+        # OLD: keep crossfade tail -> causes phase offset
+        # sr = int(self.mrt.sample_rate)
+        # xfade_s = float(self.mrt.config.crossfade_length)
+        # xfade_n = int(round(xfade_s * sr))
+        # if getattr(self, "_stream", None) is not None and self._stream.shape[0] > 0:
+        #     tail = self._stream[-xfade_n:] if self._stream.shape[0] > xfade_n else self._stream
+        #     self._stream = tail.copy()
+        # else:
+        #     self._stream = None
+        # NEW: throw away the tail completely; start fresh
+        self._stream = None
+        self._next_emit_start = 0
         self._needs_bar_realign = True
     def reseed_splice(self, recent_wav, anchor_bars: float):
+        """
+        Token-splice reseed: build a new context from the original context tokens
+        plus tokens encoded from ``recent_wav``.
+        The new context is stored in ``self._pending_reseed`` for the run loop AND
+        installed immediately on a fresh state, after which the stream is reset.
+        NOTE(review): queueing and installing at once looks redundant, and resetting
+        ``_stream`` here can overlap with ``run`` reading it outside the lock -- confirm.
+        """
         with self._lock:
             # Snapshot the context tokens the first time a splice is requested.
             if not hasattr(self, "_original_context_tokens") or self._original_context_tokens is None:
                 self._original_context_tokens = np.copy(self.state.context_tokens)
+            recent_tokens = self._make_recent_tokens_from_wave(recent_wav)  # [T, depth]
             new_ctx = self._splice_context(self._original_context_tokens, recent_tokens, anchor_bars)
+            # Queue it; the run loop will install right after we finish the current slice
             self._pending_reseed = {"ctx": new_ctx, "ref": recent_wav}
+            # install the new context window
             new_state = self.mrt.init_state()
             new_state.context_tokens = new_ctx
             self.state = new_state
+            self._prepare_stream_for_reseed_handoff()
+            # optional: ask streamer to drop an intro crossfade worth of audio right after reseed
+            # (this counter is only incremented here; its consumer is not in this section)
+            self._pending_drop_intro_bars = getattr(self, "_pending_drop_intro_bars", 0) + 1
     def run(self):
+        """Main worker loop — generate into a continuous stream, then emit bar-aligned slices.
+        Each pass: (1) generate model chunks until the stream holds enough samples,
+        (2) optionally realign the emit pointer (first chunk with a combined loop, or
+        when ``_needs_bar_realign`` is set), (3) cut ``bars_per_chunk`` bars from the
+        stream, (4) post-process, (5) resample/encode, (6) publish to ``self.outbox``
+        and install any pending reseed. Runs until ``self._stop_event`` is set.
+        Bar/chunk lengths are computed once from ``self.params`` before the loop starts.
+        """
+        spb = self._seconds_per_bar()                     # seconds per bar
+        chunk_secs = self.params.bars_per_chunk * spb
+        xfade = float(self.mrt.config.crossfade_length)   # seconds
+        sr = int(self.mrt.sample_rate)
+        chunk_samps = int(round(chunk_secs * sr))
+        def _need(first_chunk_extra=False):
+            """How many more samples we still need in the stream to emit next slice."""
+            # Samples available past the emit pointer (0 when no stream exists yet).
+            have = 0 if getattr(self, "_stream", None) is None else self._stream.shape[0] - getattr(self, "_next_emit_start", 0)
+            want = chunk_samps
             if first_chunk_extra:
+                # reserve two bars extra so first-chunk onset alignment has material
+                want += int(round(2 * spb * sr))
+            return max(0, want - have)
+        def _mono_env(x: np.ndarray, sr: int, win_ms: float = 10.0) -> np.ndarray:
+            # Onset-style envelope: mono mix -> |x| -> moving average over win_ms ->
+            # first difference with negative values zeroed.
+            if x.ndim == 2: x = x.mean(axis=1)
+            x = np.abs(x).astype(np.float32)
+            w = max(1, int(round(win_ms * 1e-3 * sr)))
+            if w > 1:
+                kern = np.ones(w, dtype=np.float32) / float(w)
+                x = np.convolve(x, kern, mode="same")
+            d = np.diff(x, prepend=x[:1])
+            d[d < 0] = 0.0
+            return d
+        def _estimate_first_offset_samples(ref_loop_wav, gen_head_wav, sr: int, spb: float) -> int:
+            """Tempo-aware first-downbeat offset (positive => model late).
+            Searches lags within +/- max_ms for the best normalised correlation between
+            the envelope of the last bar of the reference and the envelope of the first
+            two bars of generated audio. Returns 0 on empty input or on any exception.
+            """
+            try:
+                # Search window: a quarter bar, clamped to [160, 450] ms.
+                max_ms = int(max(160.0, min(0.25 * spb * 1000.0, 450.0)))
+                ref = ref_loop_wav if ref_loop_wav.sample_rate == sr else ref_loop_wav.resample(sr)
+                n_bar = int(round(spb * sr))
+                ref_tail = ref.samples[-n_bar:, :] if ref.samples.shape[0] >= n_bar else ref.samples
+                gen_head = gen_head_wav.samples[: int(2 * n_bar), :]
+                if ref_tail.size == 0 or gen_head.size == 0:
+                    return 0
+                # envelopes + z-score
+                import numpy as np
+                def _z(a):
+                    m, s = float(a.mean()), float(a.std() or 1.0); return (a - m) / s
+                e_ref = _z(_mono_env(ref_tail, sr)).astype(np.float32)
+                e_gen = _z(_mono_env(gen_head, sr)).astype(np.float32)
+                # upsample x4 for finer lag
+                def _upsample(a, r=4):
+                    n = len(a); grid = np.arange(n, dtype=np.float32)
+                    fine = np.linspace(0, n - 1, num=n * r, dtype=np.float32)
+                    return np.interp(fine, grid, a).astype(np.float32)
+                up = 4
+                e_ref_u, e_gen_u = _upsample(e_ref, up), _upsample(e_gen, up)
+                max_lag_u = int(round((max_ms / 1000.0) * sr * up))
+                seg = min(len(e_ref_u), len(e_gen_u))
+                e_ref_u = e_ref_u[-seg:]
+                # Zero-pad both sides so every lag in the search range yields a full slice.
+                pad = np.zeros(max_lag_u, dtype=np.float32)
+                e_gen_u_pad = np.concatenate([pad, e_gen_u, pad])
+                best_lag_u, best_score = 0, -1e9
+                for lag_u in range(-max_lag_u, max_lag_u + 1):
+                    start = max_lag_u + lag_u
+                    b = e_gen_u_pad[start : start + seg]
+                    # "or 1.0" avoids dividing by zero when either vector is all zeros.
+                    denom = (np.linalg.norm(e_ref_u) * np.linalg.norm(b)) or 1.0
+                    score = float(np.dot(e_ref_u, b) / denom)
+                    if score > best_score:
+                        best_score, best_lag_u = score, lag_u
+                # Convert the lag from upsampled steps back to samples.
+                return int(round(best_lag_u / up))
+            except Exception:
+                # Best-effort: any failure is treated as "no offset".
+                return 0
+        print("🚀 JamWorker started (bar-aligned streaming)…")
         while not self._stop_event.is_set():
             # Back off while the consumer is too far behind.
             if not self._should_generate_next_chunk():
                 time.sleep(0.25)
                 continue
+            # 1) Generate until we have enough material in the stream
+            need = _need(first_chunk_extra=(self.idx == 0))
+            while need > 0 and not self._stop_event.is_set():
                 # Read the current params under the lock and push them to the model.
                 with self._lock:
                     style_vec = self.params.style_vec
                     self.mrt.guidance_weight = float(self.params.guidance_weight)
+                    self.mrt.temperature     = float(self.params.temperature)
+                    self.mrt.topk            = int(self.params.topk)
                 wav, self.state = self.mrt.generate_chunk(state=self.state, style=style_vec)
+                self._append_model_chunk_to_stream(wav)   # equal-power xfade into a persistent stream
+                need = _need(first_chunk_extra=(self.idx == 0))
             if self._stop_event.is_set():
                 break
+            # 2) One-time: align the emit pointer to the groove
             if (self.idx == 0 and self.params.combined_loop is not None) or self._needs_bar_realign:
                 ref_loop = self._reseed_ref_loop or self.params.combined_loop
                 if ref_loop is not None:
+                    # Compare up to two bars of generated audio at the emit pointer against the reference.
+                    head_len = min(self._stream.shape[0] - self._next_emit_start, int(round(2 * spb * sr)))
+                    seg = self._stream[self._next_emit_start : self._next_emit_start + head_len]
+                    gen_head = au.Waveform(seg.astype(np.float32, copy=False), sr).as_stereo()
+                    offs = _estimate_first_offset_samples(ref_loop, gen_head, sr, spb)
+                    if offs != 0:
+                        self._next_emit_start = max(0, self._next_emit_start + offs)
+                        print(f"🎯 Offset compensation: {offs/sr:+.3f}s")
+                    self._realign_emit_pointer_to_bar(sr)
                 self._needs_bar_realign = False
                 self._reseed_ref_loop = None
+            # 3) Emit exactly bars_per_chunk × spb from the stream
+            start = self._next_emit_start
+            end = start + chunk_samps
+            if end > self._stream.shape[0]:
+                # shouldn't happen often; generate a bit more and loop
+                continue
+            slice_ = self._stream[start:end]
+            self._next_emit_start = end
+            y = au.Waveform(slice_.astype(np.float32, copy=False), sr).as_stereo()
+            # 4) Post-processing / loudness
+            # NOTE(review): this diff view elides the lines that finish the
+            # match_loudness_to_reference(...) call before the else branch.
             if self.idx == 0 and self.params.ref_loop is not None:
                 y, _ = match_loudness_to_reference(
                     self.params.ref_loop, y,
             else:
                 apply_micro_fades(y, 3)
+            # 5) Resample + exact-length snap + encode
+            b64, meta = self._snap_and_encode(
+                y, seconds=chunk_secs, target_sr=self.params.target_sr, bars=self.params.bars_per_chunk
+            )
+            meta["xfade_seconds"] = xfade
+            # 6) Publish
             with self._lock:
                 # idx is incremented before use, so the first published chunk has index 1.
                 self.idx += 1
+                self.outbox.append(JamChunk(index=self.idx, audio_base64=b64, metadata=meta))
                 # Prune the outbox once it exceeds 10 entries, keeping chunks newer
                 # than (last delivered index - 5).
                 if len(self.outbox) > 10:
                     cutoff = self._last_delivered_index - 5
                     self.outbox = [ch for ch in self.outbox if ch.index > cutoff]
+                # 👉 If a reseed was requested, apply it *now*, between chunks
                 if self._pending_reseed is not None:
                     pkg = self._pending_reseed
                     self._pending_reseed = None
                     new_state = self.mrt.init_state()
+                    new_state.context_tokens = pkg["ctx"]          # exact (ctx_frames, depth)
                     self.state = new_state
+                    # start a fresh stream and schedule one-time alignment
                     self._stream = None
+                    self._next_emit_start = 0
+                    self._reseed_ref_loop = pkg.get("ref") or self.params.combined_loop
                     self._needs_bar_realign = True
+                    print("🔁 Reseed installed at bar boundary; will realign before next slice")
+            print(f"✅ Completed chunk {self.idx}")
+        print("🛑 JamWorker stopped")

utils.py CHANGED Viewed

@@ -109,81 +109,55 @@ def apply_micro_fades(wav: au.Waveform, ms: int = 5) -> None:
 # ---------- Token context helpers ----------
-def make_bar_aligned_context(tokens, bpm, fps=25.0, ctx_frames=250, beats_per_bar=4, precise_timing=False):
     """
     Return a ctx_frames-long slice of `tokens` whose **end** lands on the nearest
-    whole-bar boundary in codec-frame space.
-    NEW: precise_timing mode handles fractional frames per bar more carefully.
     """
     if tokens is None:
         raise ValueError("tokens is None")
     tokens = np.asarray(tokens)
     if tokens.ndim == 1:
-        tokens = tokens[:, None]
     T = tokens.shape[0]
     if T == 0:
         return tokens
     fps = float(fps)
-    frames_per_bar_f = (beats_per_bar * 60.0 / float(bpm)) * fps
-    if precise_timing and abs(frames_per_bar_f - round(frames_per_bar_f)) > 1e-6:
-        # We have fractional frames per bar - use a different strategy
-        # Instead of trying to align to exact bar boundaries, align to the closest
-        # multiple of frames_per_bar_f that gives us integer frame positions
-        # Tile enough to work with
-        reps = max(2, int(np.ceil((ctx_frames + T) / float(T))))
-        tiled = np.tile(tokens, (reps, 1))
-        total = tiled.shape[0]
-        # Find the best integer end position that's close to a bar boundary
-        best_end = ctx_frames
-        best_error = float('inf')
-        # Check positions around the naive ctx_frames endpoint
-        for candidate_end in range(max(ctx_frames - 50, ctx_frames), min(total, ctx_frames + 50)):
-            # How many fractional bars does this represent?
-            fractional_bars = candidate_end / frames_per_bar_f
-            # How far from an integer number of bars?
-            bar_error = abs(fractional_bars - round(fractional_bars))
-            if bar_error < best_error:
-                best_error = bar_error
-                best_end = candidate_end
-        end_idx = best_end
-        start_idx = max(0, end_idx - ctx_frames)
-        window = tiled[start_idx:end_idx]
-        # Report timing info for debugging
-        actual_bars = end_idx / frames_per_bar_f
-        print(f"Context aligned to {actual_bars:.3f} bars (error: {best_error:.4f})")
-    else:
-        # Original logic for integer frames per bar
-        reps = int(np.ceil((ctx_frames + T) / float(T))) + 1
-        tiled = np.tile(tokens, (reps, 1))
-        total = tiled.shape[0]
-        k_bars = int(np.floor(total / frames_per_bar_f))
-        if k_bars <= 0:
-            window = tiled[-ctx_frames:]
-            return window
-        end_idx = int(round(k_bars * frames_per_bar_f))
-        end_idx = min(max(end_idx, ctx_frames), total)
-        start_idx = end_idx - ctx_frames
-        if start_idx < 0:
-            start_idx = 0
-            end_idx = ctx_frames
-        window = tiled[start_idx:end_idx]
-    # Ensure exact length
     if window.shape[0] < ctx_frames:
         pad = np.tile(tokens, (int(np.ceil((ctx_frames - window.shape[0]) / T)), 1))
         window = np.vstack([window, pad])[:ctx_frames]

 # ---------- Token context helpers ----------
+def make_bar_aligned_context(tokens, bpm, fps=25.0, ctx_frames=250, beats_per_bar=4):
     """
     Return a ctx_frames-long slice of `tokens` whose **end** lands on the nearest
+    whole-bar boundary in codec-frame space, even when frames_per_bar is fractional.
+    tokens: np.ndarray of shape (T, D) or (T,) where T = codec frames
+    bpm: float
+    fps: float (codec frames per second; keep this as float)
+    ctx_frames: int (length of context window in codec frames)
+    beats_per_bar: int
     """
     if tokens is None:
         raise ValueError("tokens is None")
     tokens = np.asarray(tokens)
     if tokens.ndim == 1:
+        tokens = tokens[:, None]  # promote to (T, 1) for uniform tiling
     T = tokens.shape[0]
     if T == 0:
         return tokens
     fps = float(fps)
+    frames_per_bar_f = (beats_per_bar * 60.0 / float(bpm)) * fps  # float frames per bar
+    # Tile a little more than we need so we can always snap the END to a bar boundary
+    reps = int(np.ceil((ctx_frames + T) / float(T))) + 1
+    tiled = np.tile(tokens, (reps, 1))
+    total = tiled.shape[0]
+    # How many whole bars fit?
+    k_bars = int(np.floor(total / frames_per_bar_f))
+    if k_bars <= 0:
+        # Fallback: just take the last ctx_frames
+        window = tiled[-ctx_frames:]
+        return window
+    # Snap END index to the nearest integer frame at a whole-bar boundary
+    end_idx = int(round(k_bars * frames_per_bar_f))
+    end_idx = min(max(end_idx, ctx_frames), total)
+    start_idx = end_idx - ctx_frames
+    if start_idx < 0:
+        start_idx = 0
+        end_idx = ctx_frames
+    window = tiled[start_idx:end_idx]
+    # Guard against rare off-by-one due to rounding
     if window.shape[0] < ctx_frames:
         pad = np.tile(tokens, (int(np.ceil((ctx_frames - window.shape[0]) / T)), 1))
         window = np.vstack([window, pad])[:ctx_frames]