From 211000084e0179d7e087c31a7214d72ac8b65bb0 Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:45:54 +0200
Subject: [PATCH 1/7] Configure native audio source from device, not hardcoded
 defaults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The native (Rust) audio source was created with a hardcoded sample rate
(48000) and channel count (2). Microphone frames flow through Unity's
audio graph (AudioProbe) at the actual DSP output configuration, which
often differs — e.g. with a Bluetooth headset. The Rust source does not
resample; it rejects frames whose rate/channels don't match, causing the
metadata-mismatch warning and capture failures.

Read the source's sample rate and channel count from Unity's output
configuration (AudioSettings.GetConfiguration) instead of hardcoded
defaults, falling back to the defaults only when Unity can't report one.
The base constructor now exposes a device-mode overload (type only) and an
explicit overload (type, sampleRate, channels) for sources that generate a
fixed format. MicrophoneSource and BasicAudioSource use device mode;
BasicAudioSource drops its channels parameter (a source-breaking change
for callers that passed it). SineWaveAudioSource declares its exact format.

If a frame's format still doesn't match (inconsistent Unity report or a
runtime output change), drop it with a throttled warning instead of
sending a mismatch the native side would error on. Also removes the
redundant Microphone.Start in the Meet sample.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Runtime/Scripts/BasicAudioSource.cs         |  6 +-
 Runtime/Scripts/MicrophoneSource.cs         |  2 +-
 Runtime/Scripts/RtcAudioSource.cs           | 81 ++++++++++++++++++---
 Samples~/Meet/Assets/Runtime/MeetManager.cs |  3 +-
 Tests/PlayMode/Utils/SineWaveAudioSource.cs |  2 +-
 5 files changed, 79 insertions(+), 15 deletions(-)
diff --git a/Runtime/Scripts/BasicAudioSource.cs b/Runtime/Scripts/BasicAudioSource.cs
index 3b63680b..8193090d 100644
--- a/Runtime/Scripts/BasicAudioSource.cs
+++ b/Runtime/Scripts/BasicAudioSource.cs
@@ -19,9 +19,11 @@ sealed public class BasicAudioSource : RtcAudioSource
         /// Creates a new basic audio source for the given <see cref="AudioSource"/> in the scene.
         /// </summary>
         /// <param name="source">The <see cref="AudioSource"/> to capture from.</param>
-        /// <param name="channels">The number of channels to capture.</param>
         /// <param name="sourceType">The type of audio source.</param>
-        public BasicAudioSource(AudioSource source, int channels = 2, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(channels, sourceType)
+        /// <remarks>
+        /// The sample rate and channel count are taken from Unity's audio configuration.
+        /// </remarks>
+        public BasicAudioSource(AudioSource source, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(sourceType)
         {
             _source = source;
         }
diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs
index 904b8da7..a8775568 100644
--- a/Runtime/Scripts/MicrophoneSource.cs
+++ b/Runtime/Scripts/MicrophoneSource.cs
@@ -28,7 +28,7 @@ sealed public class MicrophoneSource : RtcAudioSource
         /// get the list of available devices.</param>
         /// <param name="sourceObject">The GameObject to attach the AudioSource to. The object must be kept in the scene
         /// for the duration of the source's lifetime.</param>
-        public MicrophoneSource(string deviceName, GameObject sourceObject) : base(2, RtcAudioSourceType.AudioSourceMicrophone)
+        public MicrophoneSource(string deviceName, GameObject sourceObject) : base(RtcAudioSourceType.AudioSourceMicrophone)
         {
             _deviceName = deviceName;
             _sourceObject = sourceObject;
diff --git a/Runtime/Scripts/RtcAudioSource.cs b/Runtime/Scripts/RtcAudioSource.cs
index a9af8a0a..7f5c3d7c 100644
--- a/Runtime/Scripts/RtcAudioSource.cs
+++ b/Runtime/Scripts/RtcAudioSource.cs
@@ -83,20 +83,33 @@ private sealed class PendingAudioFrame
         private volatile bool _disposed = false;
         private int _audioReadCount = 0;
 
-        protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = RtcAudioSourceType.AudioSourceCustom)
+        // Device-capture sources (microphone, AudioSource taps) don't know their format ahead of
+        // time — it is whatever Unity's audio graph delivers. They use this constructor, which
+        // configures the native source from Unity's current output configuration.
+        protected RtcAudioSource(RtcAudioSourceType audioSourceType)
+            : this(audioSourceType, 0, 0) { }
+
+        // Sources that generate a fixed, known format (e.g. test signal generators) declare it
+        // directly. Passing 0 for either value falls back to the device configuration.
+        protected RtcAudioSource(RtcAudioSourceType audioSourceType, uint sampleRate, uint channels)
         {
             _sourceType = audioSourceType;
-            _expectedChannels = (uint)channels;
+
+            if (sampleRate > 0 && channels > 0)
+            {
+                _expectedSampleRate = sampleRate;
+                _expectedChannels = channels;
+            }
+            else
+            {
+                (_expectedSampleRate, _expectedChannels) = ResolveDeviceFormat();
+            }
 
             using var request = FFIBridge.Instance.NewRequest<NewAudioSourceRequest>();
             var newAudioSource = request.request;
             newAudioSource.Type = AudioSourceType.AudioSourceNative;
-            newAudioSource.NumChannels = (uint)channels;
-            newAudioSource.SampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone ?
-                DefaultMicrophoneSampleRate : DefaultSampleRate;
-            _expectedSampleRate = newAudioSource.SampleRate;
-
-            Utils.Debug($"NewAudioSource: {newAudioSource.NumChannels} {newAudioSource.SampleRate}");
+            newAudioSource.NumChannels = _expectedChannels;
+            newAudioSource.SampleRate = _expectedSampleRate;
 
             newAudioSource.Options = request.TempResource<AudioSourceOptions>();
             newAudioSource.Options.EchoCancellation = true;
@@ -109,6 +122,49 @@ protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType =
             Utils.Debug($"{DebugTag} created handle={Handle.DangerousGetHandle()} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}");
         }
 
+        // Reads Unity's actual output audio configuration. The capture path delivers buffers at the
+        // DSP output rate/channel count (see AudioProbe), so this is the format the native source
+        // must match. Falls back to the platform defaults when Unity cannot report a configuration
+        // (e.g. batch mode without an audio device).
+        private (uint sampleRate, uint channels) ResolveDeviceFormat()
+        {
+            uint sampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone
+                ? DefaultMicrophoneSampleRate
+                : DefaultSampleRate;
+            uint channels = DefaultChannels;
+
+            try
+            {
+                var config = UnityEngine.AudioSettings.GetConfiguration();
+                if (config.sampleRate > 0)
+                    sampleRate = (uint)config.sampleRate;
+                var configuredChannels = SpeakerModeChannels(config.speakerMode);
+                if (configuredChannels > 0)
+                    channels = configuredChannels;
+            }
+            catch (Exception e)
+            {
+                Utils.Warning($"{DebugTag} could not read Unity audio configuration, using defaults: {e.Message}");
+            }
+
+            return (sampleRate, channels);
+        }
+
+        private static uint SpeakerModeChannels(UnityEngine.AudioSpeakerMode mode)
+        {
+            switch (mode)
+            {
+                case UnityEngine.AudioSpeakerMode.Mono: return 1;
+                case UnityEngine.AudioSpeakerMode.Stereo: return 2;
+                case UnityEngine.AudioSpeakerMode.Quad: return 4;
+                case UnityEngine.AudioSpeakerMode.Surround: return 5;
+                case UnityEngine.AudioSpeakerMode.Mode5point1: return 6;
+                case UnityEngine.AudioSpeakerMode.Mode7point1: return 8;
+                case UnityEngine.AudioSpeakerMode.Prologic: return 2;
+                default: return 0;
+            }
+        }
+
         /// <summary>
         /// Begin capturing audio samples from the underlying source.
         /// </summary>
@@ -153,9 +209,16 @@ private void OnAudioRead(float[] data, int channels, int sampleRate)
                 return;
             }
 
+            // The native source rejects frames whose rate/channels differ from how it was
+            // configured (it does not resample). This should not happen now that the source is
+            // configured from the device, but if Unity reports an inconsistent format — or the
+            // output configuration changes at runtime — we drop the frame instead of sending a
+            // mismatch the native side would error on.
             if ((uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels)
             {
-                Utils.Warning($"{DebugTag} audio frame #{frameIndex} metadata mismatch actualRate={sampleRate} actualChannels={channels} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}");
+                if (frameIndex == 1 || frameIndex % 100 == 0)
+                    Utils.Warning($"{DebugTag} dropping audio frame #{frameIndex}: format {sampleRate}/{channels} does not match source {_expectedSampleRate}/{_expectedChannels} (sourceType={_sourceType})");
+                return;
             }
 
             var pendingBeforeSend = PendingFrameCount();
diff --git a/Samples~/Meet/Assets/Runtime/MeetManager.cs b/Samples~/Meet/Assets/Runtime/MeetManager.cs
index 225c7a0c..97b2cb70 100644
--- a/Samples~/Meet/Assets/Runtime/MeetManager.cs
+++ b/Samples~/Meet/Assets/Runtime/MeetManager.cs
@@ -453,8 +453,7 @@ private IEnumerator PublishLocalMicrophone()
     {
         if (_audioObjects.ContainsKey(LocalAudioTrackName)) yield break;
 
-        Microphone.Start(null, true, 10, 44100);
-
+        // MicrophoneSource starts the device itself, so we only need the device name here.
         var audioObject = new GameObject($"My Microphone: {Microphone.devices[0]}");
         audioObject.transform.SetParent(_audioTrackParent);
 
diff --git a/Tests/PlayMode/Utils/SineWaveAudioSource.cs b/Tests/PlayMode/Utils/SineWaveAudioSource.cs
index 907e9ccc..2337615b 100644
--- a/Tests/PlayMode/Utils/SineWaveAudioSource.cs
+++ b/Tests/PlayMode/Utils/SineWaveAudioSource.cs
@@ -31,7 +31,7 @@ public SineWaveAudioSource(
             int sampleRate = 48000,
             double frequencyHz = 440.0,
             float amplitude = 0.1f)
-            : base(channels, RtcAudioSourceType.AudioSourceCustom)
+            : base(RtcAudioSourceType.AudioSourceCustom, (uint)sampleRate, (uint)channels)
         {
             _channels = channels;
             _sampleRate = sampleRate;

From 834f2047ccf7b8a0a848b8b0518931462256ad4c Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Fri, 12 Jun 2026 10:12:35 +0200
Subject: [PATCH 2/7] Add throttled capture/receive rate diagnostics (Info
 level)

Temporary, ~2s-throttled diagnostics to investigate choppy received audio:

- RtcAudioSource logs the effective capture sample rate (samples/sec by
  wall clock) vs the rate declared to the native source. A measured rate
  that differs from the declared rate means the frame format label is
  wrong, which would sound fast/slow/choppy on the receiver.
- AudioStream logs buffer fill, underrun count, callback count and frames
  received, to distinguish receive-side starvation from a clean stream.

Emitted via Utils.Info so they appear without LK_DEBUG (Utils.Debug is
compiled out unless LK_DEBUG is defined).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Runtime/Scripts/AudioStream.cs    | 31 ++++++++++++++++++++++++++++
 Runtime/Scripts/RtcAudioSource.cs | 34 +++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/Runtime/Scripts/AudioStream.cs b/Runtime/Scripts/AudioStream.cs
index 4d9ab588..3288f839 100644
--- a/Runtime/Scripts/AudioStream.cs
+++ b/Runtime/Scripts/AudioStream.cs
@@ -50,6 +50,14 @@ public sealed class AudioStream : IDisposable
         private const int CrossfadeFrames = 128;  // ~2.7ms @ 48kHz
         private int _skipCooldown = 0;
 
+        // --- Temporary receive diagnostics (Info level, emitted ~every 2s) ---
+        // Reveals whether choppiness is a buffer-starvation problem (underruns/low fill) versus a
+        // clean stream, and what rate/channels we are actually playing/requesting.
+        private long _diagWindowStartTicks;
+        private int _diagCallbacks;
+        private int _diagUnderruns;
+        private int _diagFramesReceived;
+
         /// <summary>
         /// Creates a new audio stream from a remote audio track, attaching it to the
         /// given <see cref="AudioSource"/> in the scene.
@@ -147,6 +155,8 @@ private void OnAudioRead(float[] data, int channels, int sampleRate)
 
             lock (_lock)
             {
+                MaybeLogReceiveDiagnostics(channels, sampleRate);
+
                 // Single gate covering first-create and runtime format changes (e.g. after a
                 // system audio device switch). When the FFI stream is missing or what we asked
                 // Rust for no longer matches what Unity is delivering, post a (re)create to the
@@ -214,6 +224,7 @@ static float S16ToFloat(short v)
                 if (valuesAvailableToRead < data.Length)
                 {
                     _isPrimed = false;
+                    _diagUnderruns++;
                     Utils.Debug($"AudioStream underrun detected, re-priming (got {valuesAvailableToRead} samples but want to read {data.Length})");
 
                     // Output silence immediately instead of playing partial/choppy samples.
@@ -370,6 +381,7 @@ private void OnAudioStreamEvent(AudioStreamEvent e)
                     var data = new ReadOnlySpan<byte>(frame.Data.ToPointer(), frame.Length);
                     _buffer.Write(data);
                 }
+                _diagFramesReceived++;
             }
         }
 
@@ -427,6 +439,25 @@ private void Dispose(bool disposing)
             Dispose(false);
         }
 
+        // Temporary diagnostic: ~every 2s logs buffer fill, underrun count, callback count and
+        // frames received so we can tell starvation (choppy) from a clean stream. Called under _lock.
+        private void MaybeLogReceiveDiagnostics(int channels, int sampleRate)
+        {
+            _diagCallbacks++;
+            var now = System.Diagnostics.Stopwatch.GetTimestamp();
+            if (_diagWindowStartTicks == 0) _diagWindowStartTicks = now;
+            var elapsed = (now - _diagWindowStartTicks) / (double)System.Diagnostics.Stopwatch.Frequency;
+            if (elapsed < 2.0) return;
+
+            float fill = _buffer != null ? _buffer.AvailableReadInPercent() : 0f;
+            Utils.Info($"AudioStream#{_trackHandleId} diag: out={sampleRate}Hz/{channels}ch ffi={_ffiSampleRate}Hz/{_ffiNumChannels}ch " +
+                       $"bufferFill={fill * 100f:F0}% callbacks={_diagCallbacks} underruns={_diagUnderruns} framesRecv={_diagFramesReceived} over={elapsed:F1}s");
+            _diagWindowStartTicks = now;
+            _diagCallbacks = 0;
+            _diagUnderruns = 0;
+            _diagFramesReceived = 0;
+        }
+
         // For testing and debugging
         internal float GetBufferFill()
         {
diff --git a/Runtime/Scripts/RtcAudioSource.cs b/Runtime/Scripts/RtcAudioSource.cs
index 7f5c3d7c..c85e5a2d 100644
--- a/Runtime/Scripts/RtcAudioSource.cs
+++ b/Runtime/Scripts/RtcAudioSource.cs
@@ -83,6 +83,15 @@ private sealed class PendingAudioFrame
         private volatile bool _disposed = false;
         private int _audioReadCount = 0;
 
+        // --- Temporary capture-rate diagnostics (Info level, emitted ~every 2s) ---
+        // Measures the effective sample rate from wall-clock time vs the rate we declared to the
+        // native source. A measured rate that differs from the declared rate means the format
+        // label on the frames is wrong (audio would sound fast/slow/choppy on the receiver).
+        private long _diagWindowStartTicks;     // 0 = not started
+        private long _diagSamplesPerChannel;
+        private int _diagAcceptedFrames;
+        private int _diagDroppedFrames;
+
         // Device-capture sources (microphone, AudioSource taps) don't know their format ahead of
         // time — it is whatever Unity's audio graph delivers. They use this constructor, which
         // configures the native source from Unity's current output configuration.
@@ -209,6 +218,9 @@ private void OnAudioRead(float[] data, int channels, int sampleRate)
                 return;
             }
 
+            var willDrop = (uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels;
+            RecordCaptureDiagnostics(data.Length / channels, channels, sampleRate, willDrop);
+
             // The native source rejects frames whose rate/channels differ from how it was
             // configured (it does not resample). This should not happen now that the source is
             // configured from the device, but if Unity reports an inconsistent format — or the
@@ -405,6 +417,28 @@ private static double ElapsedMilliseconds(long startedTimestamp)
             return (Stopwatch.GetTimestamp() - startedTimestamp) * 1000.0 / Stopwatch.Frequency;
         }
 
+        // Temporary diagnostic: accumulates captured audio over wall-clock time and, ~every 2s,
+        // logs the effective sample rate vs the rate declared to the native source. Runs on the
+        // audio thread; the periodic Info log is cheap.
+        private void RecordCaptureDiagnostics(int samplesPerChannel, int channels, int sampleRate, bool dropped)
+        {
+            var now = Stopwatch.GetTimestamp();
+            if (_diagWindowStartTicks == 0) _diagWindowStartTicks = now;
+            _diagSamplesPerChannel += samplesPerChannel;
+            if (dropped) _diagDroppedFrames++; else _diagAcceptedFrames++;
+
+            var elapsed = (now - _diagWindowStartTicks) / (double)Stopwatch.Frequency;
+            if (elapsed < 2.0) return;
+
+            var measuredRate = _diagSamplesPerChannel / elapsed;
+            Utils.Info($"{DebugTag} capture diag: declared={_expectedSampleRate}Hz/{_expectedChannels}ch measuredRate={measuredRate:F0}Hz " +
+                       $"lastFrame={samplesPerChannel}smp/{channels}ch/{sampleRate}Hz accepted={_diagAcceptedFrames} dropped={_diagDroppedFrames} over={elapsed:F1}s");
+            _diagWindowStartTicks = now;
+            _diagSamplesPerChannel = 0;
+            _diagAcceptedFrames = 0;
+            _diagDroppedFrames = 0;
+        }
+
         private string DebugTag => $"RtcAudioSource#{_debugId}";
     }
 }

From a775e593e899b0f9d6bee902ab7876138160f4bf Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:07:23 +0200
Subject: [PATCH 3/7] Open microphone at the output sample rate to avoid
 capture drift

MicrophoneSource started the device at the hardcoded DefaultMicrophoneSampleRate
and played the looping clip through an AudioSource read on the DSP thread. When
the device's actual rate differs from the engine output rate, the clip fills and
plays back at different rates, so the read position drifts against the write
position and the captured audio becomes choppy.

Open the microphone at AudioSettings.outputSampleRate when the device supports
it (clamped to the device's reported caps; falling back to the default when the
output rate is unknown), so capture and playback run at the same rate. This also
aligns the mic rate with the native source rate, which is taken from the same
output configuration.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Runtime/Scripts/MicrophoneSource.cs | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs
index a8775568..9a4405ac 100644
--- a/Runtime/Scripts/MicrophoneSource.cs
+++ b/Runtime/Scripts/MicrophoneSource.cs
@@ -59,6 +59,28 @@ public override void Start()
             _started = true;
         }
 
+        // Opens the microphone at the engine's output sample rate when the device supports it, so
+        // the captured clip and the AudioSource that plays it back run at the same rate. A mismatch
+        // makes the looping clip drift against the playback read position and produces choppy audio.
+        // Falls back to DefaultMicrophoneSampleRate when the output rate is unknown, and clamps to
+        // the device's supported range when it reports one.
+        private static int ResolveMicrophoneSampleRate(string deviceName)
+        {
+            int target = AudioSettings.outputSampleRate;
+            if (target <= 0)
+                target = (int)DefaultMicrophoneSampleRate;
+
+            Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq);
+            // Unity reports (0, 0) when the device imposes no specific sample-rate range.
+            if (minFreq == 0 && maxFreq == 0)
+                return target;
+
+            var result = Mathf.Clamp(target, minFreq, maxFreq);
+            Utils.Info($"ResolveMicrophoneSampleRate: {result}");
+
+            return result;
+        }
+
         private IEnumerator StartMicrophone()
         {
             // Validate that the GameObject is still valid before starting
@@ -76,13 +98,14 @@ private IEnumerator StartMicrophone()
             }
 
             AudioClip clip = null;
+            var micFrequency = ResolveMicrophoneSampleRate(_deviceName);
             try
             {
                 clip = Microphone.Start(
                     _deviceName,
                     loop: true,
                     lengthSec: 1,
-                    frequency: (int)DefaultMicrophoneSampleRate
+                    frequency: micFrequency
                 );
             }
             catch (Exception e)
@@ -97,6 +120,8 @@ private IEnumerator StartMicrophone()
                 yield break;
             }
 
+            Utils.Info($"MicrophoneSource device='{_deviceName}' opened at {micFrequency}Hz (output={AudioSettings.outputSampleRate}Hz)");
+
             // Ensure no duplicate components exist before adding new ones.
             // This is important during app resume on iOS where components might not be
             // fully destroyed yet due to Unity's deferred Destroy().

From 4604f248b6f61c0e5a47008128fe5186510eda01 Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Fri, 12 Jun 2026 15:29:47 +0200
Subject: [PATCH 4/7] Add adaptive pitch servo to lock mic playback to the
 capture rate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mic clip is filled by the capture device's clock while the AudioSource
that plays it (feeding OnAudioFilterRead) runs on the output device's clock.
Some devices also misreport the clip rate entirely: a Bluetooth headset on
macOS labels its clip 16kHz while filling it at ~51kHz. Either way the read
head drifts against the write head and gets lapped, which sounds like
periodic chopping.

Add a pacing servo that measures how fast the write head actually advances
(GetPosition delta over wall clock - steady within ±0.1% even when the
instantaneous position is jumpy) and continuously adjusts AudioSource.pitch
so the read head consumes clip samples at the same rate, holding a fixed lag
behind the writer. A short pre-roll measures the rate before playback starts
so the initial pitch is already correct; the fill-rate estimate and the lag
target (sized to ~4x observed jitter, bounded by clip capacity) keep adapting
while capturing, and an out-of-bounds resync recovers from long hitches.

In the normal case the measured rate matches clip.frequency, pitch hovers at
~1.0, and the servo is effectively a no-op. In the misreporting case pitch
settles at the true ratio (~3.2), which plays the clip's real-time data at
correct speed and eliminates the chop. Pitch is rate control, not a delay:
the added latency is only the held lag (adaptive, 80-250ms, further capped
by the clip's real capacity).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Runtime/Scripts/MicrophoneSource.cs | 131 +++++++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 1 deletion(-)

diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs
index 9a4405ac..a8cd1c77 100644
--- a/Runtime/Scripts/MicrophoneSource.cs
+++ b/Runtime/Scripts/MicrophoneSource.cs
@@ -13,6 +13,23 @@ namespace LiveKit
     /// </remarks>
     sealed public class MicrophoneSource : RtcAudioSource
     {
+        // --- Playback pacing servo ---
+        // The mic clip is filled by the capture device's clock while the AudioSource that plays it
+        // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. Worse,
+        // some devices misreport the clip rate entirely (a Bluetooth headset on macOS labeled its
+        // clip 16kHz while filling it at ~51kHz). Either way the read head drifts against the
+        // write head until it gets lapped, which sounds like periodic chopping. The servo measures
+        // how fast the write head actually advances and continuously adjusts AudioSource.pitch so
+        // the read head consumes clip samples at the same rate, holding a fixed lag behind the
+        // writer. In the normal case the measured rate matches clip.frequency and pitch stays ~1.
+        private const float PreRollSeconds = 0.3f;       // initial fill-rate measurement window
+        private const float MinTargetLagSec = 0.08f;     // smallest safety lag (good devices)
+        private const float MaxTargetLagSec = 0.25f;     // adaptive ceiling (jittery devices)
+        private const float PitchCorrectionGain = 0.5f;  // proportional gain on relative lag error
+        private const float MaxRelativeCorrection = 0.2f;
+        private const float MinPitch = 0.25f;
+        private const float MaxPitch = 8f;
+
         private readonly GameObject _sourceObject;
         private readonly string _deviceName;
 
@@ -160,10 +177,122 @@ private IEnumerator StartMicrophone()
                 yield break;
             }
 
-            source.Play();
+            // Playback is started by the pacing servo, which first measures the clip's true fill
+            // rate so the initial pitch and read position are right from the first sample.
+            MonoBehaviourContext.RunCoroutine(PaceMicrophonePlayback(source, clip));
             Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully");
         }
 
+        // Keeps the AudioSource's read head locked a fixed lag behind the mic's write head by
+        // adjusting pitch (see the servo comment at the top of the class). Pitch is rate control,
+        // not a delay: the only latency this adds is the target lag itself.
+        private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip)
+        {
+            int clipFrames = clip.samples;
+            int declaredRate = clip.frequency;
+
+            // Pre-roll: measure the true fill rate before playback starts. GetPosition's
+            // instantaneous position can be jumpy on misbehaving devices, but its average advance
+            // has measured steady (±0.1%), so a short window gives a reliable rate.
+            int prevWrite = Microphone.GetPosition(_deviceName);
+            long writeAdvance = 0;
+            var preRoll = System.Diagnostics.Stopwatch.StartNew();
+            while (preRoll.Elapsed.TotalSeconds < PreRollSeconds)
+            {
+                if (!_started || _disposed || source == null || !Microphone.IsRecording(_deviceName)) yield break;
+                yield return null;
+                int w = Microphone.GetPosition(_deviceName);
+                writeAdvance += ((w - prevWrite) % clipFrames + clipFrames) % clipFrames;
+                prevWrite = w;
+            }
+            if (!_started || _disposed || source == null) yield break;
+
+            double fillRate = writeAdvance > 0 ? writeAdvance / preRoll.Elapsed.TotalSeconds : declaredRate;
+            double basePitch = fillRate / declaredRate;
+            // The lag target must stay well below the clip's real capacity (clipFrames at the
+            // true fill rate), which can be much shorter than lengthSec when the rate is misreported.
+            float capacityCapSec = (float)(0.4 * clipFrames / fillRate);
+            float targetLagSec = Mathf.Min(MinTargetLagSec, capacityCapSec);
+
+            source.pitch = Mathf.Clamp((float)basePitch, MinPitch, MaxPitch);
+            source.Play();
+            long targetLag = (long)(targetLagSec * fillRate);
+            int startRead = (int)(((prevWrite - targetLag) % clipFrames + clipFrames) % clipFrames);
+            source.timeSamples = startRead;
+
+            Utils.Info($"MicrophoneSource pacing: measured={fillRate:F0}Hz declared={declaredRate}Hz pitch={source.pitch:F2} lag={targetLagSec * 1000:F0}ms");
+
+            int prevRead = startRead;
+            double lag = targetLag;          // current read-behind-write distance, in clip samples
+            double smoothedLag = lag;
+            double jitter = 0;
+            long rateAdvance = 0;
+            var rateWindow = System.Diagnostics.Stopwatch.StartNew();
+            var statusWindow = System.Diagnostics.Stopwatch.StartNew();
+
+            while (_started && !_disposed && source != null && Microphone.IsRecording(_deviceName))
+            {
+                yield return null;
+                if (source == null) yield break;
+
+                int w = Microphone.GetPosition(_deviceName);
+                int r = source.timeSamples;
+                // Unwrapped per-frame advances. A hitch longer than the clip aliases these; the
+                // resync guard below recovers from the resulting inconsistency.
+                long dw = ((w - prevWrite) % clipFrames + clipFrames) % clipFrames;
+                long dr = ((r - prevRead) % clipFrames + clipFrames) % clipFrames;
+                prevWrite = w;
+                prevRead = r;
+                lag += dw - dr;
+                rateAdvance += dw;
+
+                smoothedLag = 0.95 * smoothedLag + 0.05 * lag;
+                jitter = 0.95 * jitter + 0.05 * Math.Abs(lag - smoothedLag);
+
+                // Refine the fill rate and adapt the lag target once per second.
+                if (rateWindow.Elapsed.TotalSeconds >= 1.0)
+                {
+                    double instRate = rateAdvance / rateWindow.Elapsed.TotalSeconds;
+                    if (instRate > 0)
+                    {
+                        fillRate = 0.7 * fillRate + 0.3 * instRate;
+                        basePitch = fillRate / declaredRate;
+                    }
+                    rateAdvance = 0;
+                    rateWindow.Restart();
+
+                    // Hold ~4x the observed jitter as safety margin, within bounds and capacity.
+                    float jitterSec = (float)(jitter / fillRate);
+                    capacityCapSec = (float)(0.4 * clipFrames / fillRate);
+                    targetLagSec = Mathf.Min(Mathf.Clamp(jitterSec * 4f, MinTargetLagSec, MaxTargetLagSec), capacityCapSec);
+                }
+
+                // Proportional pitch correction toward the target lag.
+                double target = targetLagSec * fillRate;
+                double relErr = (smoothedLag - target) / target;
+                relErr = Math.Max(-MaxRelativeCorrection, Math.Min(MaxRelativeCorrection, relErr));
+                source.pitch = Mathf.Clamp((float)(basePitch * (1.0 + PitchCorrectionGain * relErr)), MinPitch, MaxPitch);
+
+                // Out of bounds (reader overran the writer, or fell so far behind it reads
+                // overwritten data): jump back to the target lag. Audible once, then stable.
+                if (lag < 0 || lag > clipFrames * 0.9)
+                {
+                    int resyncRead = (int)(((w - (long)target) % clipFrames + clipFrames) % clipFrames);
+                    source.timeSamples = resyncRead;
+                    prevRead = resyncRead;
+                    lag = target;
+                    smoothedLag = target;
+                    Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (rate={fillRate:F0}Hz pitch={source.pitch:F2})");
+                }
+
+                if (statusWindow.Elapsed.TotalSeconds >= 5.0)
+                {
+                    Utils.Info($"MicrophoneSource pacing: rate={fillRate:F0}Hz pitch={source.pitch:F2} lag={smoothedLag / fillRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / fillRate * 1000:F1}ms");
+                    statusWindow.Restart();
+                }
+            }
+        }
+
         /// <summary>
         /// Stops capturing audio from the microphone.
         /// </summary>

From 99e35ea3a1ed6402951aea1928337d0d23df2ff8 Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Fri, 12 Jun 2026 15:39:22 +0200
Subject: [PATCH 5/7] Pin pitch to ~1; use GetPosition only after rescaling,
 to estimate the write head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Field test falsified the previous model: with pitch set to the measured
counter ratio (3.2), the published audio became garbled repeats ("noise with
echo"), while the servo's own lag telemetry stayed perfectly stable — because
it was measuring against the same lying counter. Combined with earlier
results (1x playback yields correct-pitch voice; reading at the counter's
pace yields noise), the consistent model is:

- The clip DATA genuinely is at clip.frequency (16kHz here).
- Microphone.GetPosition's counter is inflated ~3.2x on macOS + BT-HFP; it
  does not describe the data. The choppiness on the plain path is the read
  head colliding with the bursty real write head due to a small, unmanaged
  startup lag — not a rate mismatch.

Rework the servo accordingly: pitch stays pinned near 1.0 (max ±3% trim).
The counter is used only after rescaling by its measured inflation factor
k = counterRate / clip.frequency (~1 on healthy devices) to estimate the
real write head, and the servo holds the read head a generous adaptive lag
(150ms default) behind that estimate. Clip buffer extended to 2s for more
collision headroom.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Runtime/Scripts/MicrophoneSource.cs | 129 +++++++++++++++-------------
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs
index a8cd1c77..1d9a8e6a 100644
--- a/Runtime/Scripts/MicrophoneSource.cs
+++ b/Runtime/Scripts/MicrophoneSource.cs
@@ -15,20 +15,24 @@ sealed public class MicrophoneSource : RtcAudioSource
     {
         // --- Playback pacing servo ---
         // The mic clip is filled by the capture device's clock while the AudioSource that plays it
-        // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. Worse,
-        // some devices misreport the clip rate entirely (a Bluetooth headset on macOS labeled its
-        // clip 16kHz while filling it at ~51kHz). Either way the read head drifts against the
-        // write head until it gets lapped, which sounds like periodic chopping. The servo measures
-        // how fast the write head actually advances and continuously adjusts AudioSource.pitch so
-        // the read head consumes clip samples at the same rate, holding a fixed lag behind the
-        // writer. In the normal case the measured rate matches clip.frequency and pitch stays ~1.
-        private const float PreRollSeconds = 0.3f;       // initial fill-rate measurement window
-        private const float MinTargetLagSec = 0.08f;     // smallest safety lag (good devices)
-        private const float MaxTargetLagSec = 0.25f;     // adaptive ceiling (jittery devices)
-        private const float PitchCorrectionGain = 0.5f;  // proportional gain on relative lag error
-        private const float MaxRelativeCorrection = 0.2f;
-        private const float MinPitch = 0.25f;
-        private const float MaxPitch = 8f;
+        // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. With a
+        // small, unmanaged startup lag the read head collides with the (bursty) write head and the
+        // captured audio chops.
+        //
+        // Microphone.GetPosition cannot be trusted directly: on macOS with a Bluetooth HFP headset
+        // its counter advances ~3.2x faster than data is actually written (clip labeled 16kHz,
+        // counter at ~51k/s). The clip DATA is still genuinely at clip.frequency — playing it at
+        // 1x yields correct-pitch voice, while reading at the counter's pace yields garbled
+        // repeats. So the servo keeps pitch at ~1.0 and uses the counter only after rescaling by
+        // its measured inflation factor k (counterRate / clip.frequency, ~1 on healthy devices) to
+        // estimate the real write head, holding the read head a generous lag behind it with only
+        // tiny pitch trims.
+        private const float PreRollSeconds = 0.3f;        // counter-rate measurement window
+        private const float DefaultTargetLagSec = 0.15f;  // initial read-behind-write lag
+        private const float MinTargetLagSec = 0.10f;
+        private const float MaxTargetLagSec = 0.40f;      // adaptive ceiling (jittery devices)
+        private const float TrimGain = 0.5f;              // proportional gain on relative lag error
+        private const float MaxPitchTrim = 0.03f;         // pitch stays within [0.97, 1.03]
 
         private readonly GameObject _sourceObject;
         private readonly string _deviceName;
@@ -121,7 +125,7 @@ private IEnumerator StartMicrophone()
                 clip = Microphone.Start(
                     _deviceName,
                     loop: true,
-                    lengthSec: 1,
+                    lengthSec: 2,
                     frequency: micFrequency
                 );
             }
@@ -183,50 +187,56 @@ private IEnumerator StartMicrophone()
             Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully");
         }
 
-        // Keeps the AudioSource's read head locked a fixed lag behind the mic's write head by
-        // adjusting pitch (see the servo comment at the top of the class). Pitch is rate control,
-        // not a delay: the only latency this adds is the target lag itself.
+        // Keeps the AudioSource's read head a fixed lag behind the (estimated) real write head
+        // (see the servo comment at the top of the class). Pitch stays ~1.0 — the clip data rate
+        // IS clip.frequency — with only tiny trims; the added latency is the held lag itself.
         private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip)
         {
             int clipFrames = clip.samples;
             int declaredRate = clip.frequency;
 
-            // Pre-roll: measure the true fill rate before playback starts. GetPosition's
-            // instantaneous position can be jumpy on misbehaving devices, but its average advance
-            // has measured steady (±0.1%), so a short window gives a reliable rate.
-            int prevWrite = Microphone.GetPosition(_deviceName);
-            long writeAdvance = 0;
+            // Pre-roll: measure how fast GetPosition's counter advances. Its instantaneous value
+            // can be jumpy, but its average advance is steady (±0.1% measured), so a short window
+            // gives a reliable rate. k is the counter's inflation relative to the data rate.
+            int prevCounter = Microphone.GetPosition(_deviceName);
+            long counterUnwrapped = prevCounter; // counter ran since Microphone.Start; small so far
+            long preRollStart = counterUnwrapped;
             var preRoll = System.Diagnostics.Stopwatch.StartNew();
             while (preRoll.Elapsed.TotalSeconds < PreRollSeconds)
             {
                 if (!_started || _disposed || source == null || !Microphone.IsRecording(_deviceName)) yield break;
                 yield return null;
-                int w = Microphone.GetPosition(_deviceName);
-                writeAdvance += ((w - prevWrite) % clipFrames + clipFrames) % clipFrames;
-                prevWrite = w;
+                int c = Microphone.GetPosition(_deviceName);
+                counterUnwrapped += ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
+                prevCounter = c;
             }
             if (!_started || _disposed || source == null) yield break;
 
-            double fillRate = writeAdvance > 0 ? writeAdvance / preRoll.Elapsed.TotalSeconds : declaredRate;
-            double basePitch = fillRate / declaredRate;
-            // The lag target must stay well below the clip's real capacity (clipFrames at the
-            // true fill rate), which can be much shorter than lengthSec when the rate is misreported.
-            float capacityCapSec = (float)(0.4 * clipFrames / fillRate);
-            float targetLagSec = Mathf.Min(MinTargetLagSec, capacityCapSec);
+            double counterRate = (counterUnwrapped - preRollStart) / preRoll.Elapsed.TotalSeconds;
+            if (counterRate <= 0) counterRate = declaredRate;
+            double k = counterRate / declaredRate; // ~1 on healthy devices, ~3.2 on macOS BT-HFP
 
-            source.pitch = Mathf.Clamp((float)basePitch, MinPitch, MaxPitch);
-            source.Play();
-            long targetLag = (long)(targetLagSec * fillRate);
-            int startRead = (int)(((prevWrite - targetLag) % clipFrames + clipFrames) % clipFrames);
-            source.timeSamples = startRead;
+            // Lag target, bounded by the clip's data capacity (clipFrames samples).
+            float capacityCapSec = 0.4f * clipFrames / declaredRate;
+            float targetLagSec = Mathf.Min(DefaultTargetLagSec, capacityCapSec);
+            double target = targetLagSec * declaredRate;
 
-            Utils.Info($"MicrophoneSource pacing: measured={fillRate:F0}Hz declared={declaredRate}Hz pitch={source.pitch:F2} lag={targetLagSec * 1000:F0}ms");
+            // Estimated real write head in data samples: the counter rescaled by k (both started
+            // at zero when capture began).
+            double writeEst = counterUnwrapped / k;
 
+            source.pitch = 1f;
+            source.Play();
+            int startRead = (int)((((long)(writeEst - target)) % clipFrames + clipFrames) % clipFrames);
+            source.timeSamples = startRead;
             int prevRead = startRead;
-            double lag = targetLag;          // current read-behind-write distance, in clip samples
+            double lag = target; // data samples the reader trails the estimated writer
             double smoothedLag = lag;
             double jitter = 0;
-            long rateAdvance = 0;
+
+            Utils.Info($"MicrophoneSource pacing: counter={counterRate:F0}/s k={k:F2} dataRate={declaredRate}Hz lag={targetLagSec * 1000:F0}ms");
+
+            long counterWindow = 0;
             var rateWindow = System.Diagnostics.Stopwatch.StartNew();
             var statusWindow = System.Diagnostics.Stopwatch.StartNew();
 
@@ -235,59 +245,60 @@ private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip)
                 yield return null;
                 if (source == null) yield break;
 
-                int w = Microphone.GetPosition(_deviceName);
+                int c = Microphone.GetPosition(_deviceName);
                 int r = source.timeSamples;
                 // Unwrapped per-frame advances. A hitch longer than the clip aliases these; the
                 // resync guard below recovers from the resulting inconsistency.
-                long dw = ((w - prevWrite) % clipFrames + clipFrames) % clipFrames;
+                long dc = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
                 long dr = ((r - prevRead) % clipFrames + clipFrames) % clipFrames;
-                prevWrite = w;
+                prevCounter = c;
                 prevRead = r;
-                lag += dw - dr;
-                rateAdvance += dw;
+                counterUnwrapped += dc;
+                counterWindow += dc;
+                lag += dc / k - dr;
 
                 smoothedLag = 0.95 * smoothedLag + 0.05 * lag;
                 jitter = 0.95 * jitter + 0.05 * Math.Abs(lag - smoothedLag);
 
-                // Refine the fill rate and adapt the lag target once per second.
+                // Refine the counter rate and adapt the lag target once per second.
                 if (rateWindow.Elapsed.TotalSeconds >= 1.0)
                 {
-                    double instRate = rateAdvance / rateWindow.Elapsed.TotalSeconds;
+                    double instRate = counterWindow / rateWindow.Elapsed.TotalSeconds;
                     if (instRate > 0)
                     {
-                        fillRate = 0.7 * fillRate + 0.3 * instRate;
-                        basePitch = fillRate / declaredRate;
+                        counterRate = 0.7 * counterRate + 0.3 * instRate;
+                        k = counterRate / declaredRate;
                     }
-                    rateAdvance = 0;
+                    counterWindow = 0;
                     rateWindow.Restart();
 
                     // Hold ~4x the observed jitter as safety margin, within bounds and capacity.
-                    float jitterSec = (float)(jitter / fillRate);
-                    capacityCapSec = (float)(0.4 * clipFrames / fillRate);
+                    float jitterSec = (float)(jitter / declaredRate);
                     targetLagSec = Mathf.Min(Mathf.Clamp(jitterSec * 4f, MinTargetLagSec, MaxTargetLagSec), capacityCapSec);
+                    target = targetLagSec * declaredRate;
                 }
 
-                // Proportional pitch correction toward the target lag.
-                double target = targetLagSec * fillRate;
+                // Tiny proportional pitch trim toward the target lag. The data rate is
+                // clip.frequency, so pitch must stay pinned near 1.
                 double relErr = (smoothedLag - target) / target;
-                relErr = Math.Max(-MaxRelativeCorrection, Math.Min(MaxRelativeCorrection, relErr));
-                source.pitch = Mathf.Clamp((float)(basePitch * (1.0 + PitchCorrectionGain * relErr)), MinPitch, MaxPitch);
+                relErr = Math.Max(-1.0, Math.Min(1.0, relErr));
+                source.pitch = 1f + Mathf.Clamp((float)(TrimGain * relErr) * MaxPitchTrim, -MaxPitchTrim, MaxPitchTrim);
 
                 // Out of bounds (reader overran the writer, or fell so far behind it reads
                 // overwritten data): jump back to the target lag. Audible once, then stable.
                 if (lag < 0 || lag > clipFrames * 0.9)
                 {
-                    int resyncRead = (int)(((w - (long)target) % clipFrames + clipFrames) % clipFrames);
+                    int resyncRead = (int)((((long)(counterUnwrapped / k - target)) % clipFrames + clipFrames) % clipFrames);
                     source.timeSamples = resyncRead;
                     prevRead = resyncRead;
                     lag = target;
                     smoothedLag = target;
-                    Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (rate={fillRate:F0}Hz pitch={source.pitch:F2})");
+                    Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (k={k:F2} pitch={source.pitch:F3})");
                 }
 
                 if (statusWindow.Elapsed.TotalSeconds >= 5.0)
                 {
-                    Utils.Info($"MicrophoneSource pacing: rate={fillRate:F0}Hz pitch={source.pitch:F2} lag={smoothedLag / fillRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / fillRate * 1000:F1}ms");
+                    Utils.Info($"MicrophoneSource pacing: k={k:F2} pitch={source.pitch:F3} lag={smoothedLag / declaredRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / declaredRate * 1000:F1}ms");
                     statusWindow.Restart();
                 }
             }

From e17fd2bbd75309728d9c84df35441b79a29317aa Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Fri, 12 Jun 2026 15:46:07 +0200
Subject: [PATCH 6/7] Add editor-only mic clip WAV dump (temp diagnostic)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reworked servo's telemetry is perfect in the bad state (k=3.20,
pitch~1.00, lag locked on target, jitter ~0, no resyncs) yet the published
audio still chops like the unpaced path. That falsifies the read/write
collision model: the reader is provably never near the writer.

Remaining hypothesis: the chop is baked into the clip data itself — FMOD
scatters the real 16kHz samples at the inflated counter's positions, leaving
stale regions between fragments (only ~31% of each cycle, i.e. 1/k, is fresh).
That would also explain why counter-paced reading sounds like noise with echo
(fragments interleaved with stale older audio, played fast).

Snapshot the raw clip to a WAV 4s after capture starts (editor-only) so the
buffer contents can be inspected directly: contiguous voice means the chop
is downstream and still fixable; fragmented voice means capture data is
destroyed at write time and the Unity Microphone path cannot work for this
device.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Runtime/Scripts/MicrophoneSource.cs | 47 +++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs
index 1d9a8e6a..16ddea69 100644
--- a/Runtime/Scripts/MicrophoneSource.cs
+++ b/Runtime/Scripts/MicrophoneSource.cs
@@ -184,9 +184,56 @@ private IEnumerator StartMicrophone()
             // Playback is started by the pacing servo, which first measures the clip's true fill
             // rate so the initial pitch and read position are right from the first sample.
             MonoBehaviourContext.RunCoroutine(PaceMicrophonePlayback(source, clip));
+#if UNITY_EDITOR
+            MonoBehaviourContext.RunCoroutine(DumpClipOnce(clip));
+#endif
             Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully");
         }
 
+#if UNITY_EDITOR
+        // TEMP diagnostic: snapshots the raw mic clip to a WAV so its contents can be inspected
+        // offline — is it one contiguous audio stream, or voice fragments scattered between stale
+        // regions? Speak continuously for the first ~5 seconds of capture. Editor-only.
+        private IEnumerator DumpClipOnce(AudioClip clip)
+        {
+            yield return new WaitForSeconds(4f);
+            if (_disposed || clip == null) yield break;
+            try
+            {
+                var data = new float[clip.samples * clip.channels];
+                clip.GetData(data, 0);
+                var path = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "lk_mic_clip.wav");
+                WriteWav(path, data, clip.channels, clip.frequency);
+                Utils.Info($"MicrophoneSource: dumped clip snapshot to {path} ({clip.samples} frames @ {clip.frequency}Hz/{clip.channels}ch)");
+            }
+            catch (Exception e)
+            {
+                Utils.Warning($"MicrophoneSource: clip dump failed: {e.Message}");
+            }
+        }
+
+        private static void WriteWav(string path, float[] samples, int channels, int sampleRate)
+        {
+            using var fs = new System.IO.FileStream(path, System.IO.FileMode.Create);
+            using var w = new System.IO.BinaryWriter(fs);
+            int dataBytes = samples.Length * 2;
+            w.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
+            w.Write(36 + dataBytes);
+            w.Write(System.Text.Encoding.ASCII.GetBytes("WAVEfmt "));
+            w.Write(16);
+            w.Write((short)1);              // PCM
+            w.Write((short)channels);
+            w.Write(sampleRate);
+            w.Write(sampleRate * channels * 2);
+            w.Write((short)(channels * 2)); // block align
+            w.Write((short)16);             // bits per sample
+            w.Write(System.Text.Encoding.ASCII.GetBytes("data"));
+            w.Write(dataBytes);
+            foreach (var s in samples)
+                w.Write((short)(Mathf.Clamp(s, -1f, 1f) * 32767f));
+        }
+#endif
+
         // Keeps the AudioSource's read head a fixed lag behind the (estimated) real write head
         // (see the servo comment at the top of the class). Pitch stays ~1.0 — the clip data rate
         // IS clip.frequency — with only tiny trims; the added latency is the held lag itself.

From 8ff252ba164642729138a5b53a24ad122f87fa03 Mon Sep 17 00:00:00 2001
From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:00:54 +0200
Subject: [PATCH 7/7] Reconstruct fragmented mic clip by reading valid samples
 per stride

A raw dump of the mic clip in the macOS + Bluetooth HFP state revealed the
true buffer structure: FMOD writes each real 20ms packet of clip.frequency
audio, then advances the position counter as if it had written k (~3.2x) as
much and zero-fills the skipped range. The buffer holds valid fragments of
exactly 320 samples at a stride of exactly 1024 (320/1024 = 1/k), and the
fragments join continuously (junction sample deltas within normal in-fragment
variation) - i.e. the full audio stream is present, just zero-padded.
Concatenating the fragments reconstructed clean, correct-pitch voice
(verified by ear), which also explains every earlier symptom: plain playback
= 31% voice + 69% silence (chop); counter-paced reading = fragments and
padding played fast over a live buffer (noise with echo).

Replace the pitch-servo playback approach with fragment-aware direct capture:

- Read the clip ring buffer directly (no AudioSource, no OnAudioFilterRead),
  which also decouples capture from the output device's clock.
- Pre-roll measures the counter rate (k = counterRate / clip.frequency) and
  the counter's smallest discrete jump (the stride J).
- k ~ 1: plain contiguous read at the counter's pace (healthy devices).
- k > 1.05: read only the first J/k samples of each stride - exactly the
  valid fragments - skipping the zero padding.
- Downmix to mono and resample clip.frequency -> 48kHz (streaming linear
  interpolation; resampler state carries across fragments since their
  junctions are continuous), feeding a native source fixed at 48kHz mono.
- Backlog beyond 200ms after a stall is dropped, stride-aligned, to avoid
  overrunning the native queue.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Runtime/Scripts/MicrophoneSource.cs | 392 +++++++++++-----------------
 1 file changed, 157 insertions(+), 235 deletions(-)

diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs
index 16ddea69..1a1823f0 100644
--- a/Runtime/Scripts/MicrophoneSource.cs
+++ b/Runtime/Scripts/MicrophoneSource.cs
@@ -1,5 +1,6 @@
 using System;
 using System.Collections;
+using System.Collections.Generic;
 using UnityEngine;
 using LiveKit.Internal;
 
@@ -13,46 +14,59 @@ namespace LiveKit
     /// </remarks>
     sealed public class MicrophoneSource : RtcAudioSource
     {
-        // --- Playback pacing servo ---
-        // The mic clip is filled by the capture device's clock while the AudioSource that plays it
-        // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. With a
-        // small, unmanaged startup lag the read head collides with the (bursty) write head and the
-        // captured audio chops.
+        // --- Capture design ---
+        // The microphone clip's ring buffer is read directly (no AudioSource playback, no
+        // OnAudioFilterRead), so capture is decoupled from the output device's clock.
         //
-        // Microphone.GetPosition cannot be trusted directly: on macOS with a Bluetooth HFP headset
-        // its counter advances ~3.2x faster than data is actually written (clip labeled 16kHz,
-        // counter at ~51k/s). The clip DATA is still genuinely at clip.frequency — playing it at
-        // 1x yields correct-pitch voice, while reading at the counter's pace yields garbled
-        // repeats. So the servo keeps pitch at ~1.0 and uses the counter only after rescaling by
-        // its measured inflation factor k (counterRate / clip.frequency, ~1 on healthy devices) to
-        // estimate the real write head, holding the read head a generous lag behind it with only
-        // tiny pitch trims.
-        private const float PreRollSeconds = 0.3f;        // counter-rate measurement window
-        private const float DefaultTargetLagSec = 0.15f;  // initial read-behind-write lag
-        private const float MinTargetLagSec = 0.10f;
-        private const float MaxTargetLagSec = 0.40f;      // adaptive ceiling (jittery devices)
-        private const float TrimGain = 0.5f;              // proportional gain on relative lag error
-        private const float MaxPitchTrim = 0.03f;         // pitch stays within [0.97, 1.03]
-
-        private readonly GameObject _sourceObject;
+        // Microphone.GetPosition cannot be trusted as a sample position on every platform. On
+        // macOS with a Bluetooth HFP headset, FMOD writes each real 20ms packet of clip.frequency
+        // audio, then advances the position counter ~3.2x too far and zero-fills the skipped
+        // range. The buffer then holds valid fragments of N samples at a stride J (measured: 320
+        // of every 1024) and the counter rate is k = J/N times the data rate. Inspection of a raw
+        // buffer dump showed the fragments are consecutive speech that joins continuously, so the
+        // stream is reconstructed losslessly by reading only the first N = J/k samples of each
+        // stride. Healthy devices have k ~ 1 and use a plain contiguous read.
+        //
+        // The clip's data rate is clip.frequency (verified: fragments play at correct pitch), so
+        // captured samples are resampled from clip.frequency to the fixed native-source rate.
+        private const uint TargetSampleRate = 48000;
+        private const float PreRollSeconds = 0.3f;
+        private const double FragmentedKThreshold = 1.05;
+        private const float MaxBacklogSeconds = 0.2f; // drop backlog beyond this after a stall
+
         private readonly string _deviceName;
 
         public override event Action<float[], int, int> AudioRead;
 
         private bool _disposed = false;
         private bool _started = false;
+        private volatile bool _capturing = false;
+
+        // Streaming linear-resampler state (input = clip.frequency, output = TargetSampleRate).
+        private double _resamplePos;
+        private float _resamplePrev;
 
         /// <summary>
         /// Creates a new microphone source for the given device.
         /// </summary>
         /// <param name="deviceName">The name of the device to capture from. Use <see cref="Microphone.devices"/> to
         /// get the list of available devices.</param>
-        /// <param name="sourceObject">The GameObject to attach the AudioSource to. The object must be kept in the scene
-        /// for the duration of the source's lifetime.</param>
-        public MicrophoneSource(string deviceName, GameObject sourceObject) : base(RtcAudioSourceType.AudioSourceMicrophone)
+        /// <param name="sourceObject">Unused; retained for compatibility. The microphone clip is read
+        /// directly, so no scene GameObject/AudioSource is required.</param>
+        public MicrophoneSource(string deviceName, GameObject sourceObject)
+            : base(RtcAudioSourceType.AudioSourceMicrophone, TargetSampleRate, 1)
         {
             _deviceName = deviceName;
-            _sourceObject = sourceObject;
+        }
+
+        // The rate requested from Microphone.Start (a hint the platform may not honor), clamped to
+        // the device's reported range. The authoritative data rate is clip.frequency afterwards.
+        private static int ResolveRequestedSampleRate(string deviceName)
+        {
+            Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq);
+            if (minFreq == 0 && maxFreq == 0)
+                return (int)TargetSampleRate;
+            return Mathf.Clamp((int)TargetSampleRate, minFreq, maxFreq);
         }
 
         /// <summary>
@@ -70,7 +84,6 @@ public override void Start()
             base.Start();
             if (_started) return;
 
-
             if (!Application.HasUserAuthorization(mode: UserAuthorization.Microphone))
                 throw new InvalidOperationException("Microphone access not authorized");
 
@@ -80,37 +93,8 @@ public override void Start()
             _started = true;
         }
 
-        // Opens the microphone at the engine's output sample rate when the device supports it, so
-        // the captured clip and the AudioSource that plays it back run at the same rate. A mismatch
-        // makes the looping clip drift against the playback read position and produces choppy audio.
-        // Falls back to DefaultMicrophoneSampleRate when the output rate is unknown, and clamps to
-        // the device's supported range when it reports one.
-        private static int ResolveMicrophoneSampleRate(string deviceName)
-        {
-            int target = AudioSettings.outputSampleRate;
-            if (target <= 0)
-                target = (int)DefaultMicrophoneSampleRate;
-
-            Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq);
-            // Unity reports (0, 0) when the device imposes no specific sample-rate range.
-            if (minFreq == 0 && maxFreq == 0)
-                return target;
-
-            var result = Mathf.Clamp(target, minFreq, maxFreq);
-            Utils.Info($"ResolveMicrophoneSampleRate: {result}");
-
-            return result;
-        }
-
         private IEnumerator StartMicrophone()
         {
-            // Validate that the GameObject is still valid before starting
-            if (_sourceObject == null)
-            {
-                Utils.Error("MicrophoneSource: GameObject is null, cannot start microphone");
-                yield break;
-            }
-
             // Verify microphone is still authorized (could change during background)
             if (!Application.HasUserAuthorization(UserAuthorization.Microphone))
             {
@@ -119,14 +103,14 @@ private IEnumerator StartMicrophone()
             }
 
             AudioClip clip = null;
-            var micFrequency = ResolveMicrophoneSampleRate(_deviceName);
+            int requestedRate = ResolveRequestedSampleRate(_deviceName);
             try
             {
                 clip = Microphone.Start(
                     _deviceName,
                     loop: true,
                     lengthSec: 2,
-                    frequency: micFrequency
+                    frequency: requestedRate
                 );
             }
             catch (Exception e)
@@ -141,31 +125,6 @@ private IEnumerator StartMicrophone()
                 yield break;
             }
 
-            Utils.Info($"MicrophoneSource device='{_deviceName}' opened at {micFrequency}Hz (output={AudioSettings.outputSampleRate}Hz)");
-
-            // Ensure no duplicate components exist before adding new ones.
-            // This is important during app resume on iOS where components might not be
-            // fully destroyed yet due to Unity's deferred Destroy().
-            var existingSource = _sourceObject.GetComponent<AudioSource>();
-            if (existingSource != null)
-                UnityEngine.Object.DestroyImmediate(existingSource);
-
-            var existingProbe = _sourceObject.GetComponent<AudioProbe>();
-            if (existingProbe != null)
-            {
-                existingProbe.AudioRead -= OnAudioRead;
-                UnityEngine.Object.DestroyImmediate(existingProbe);
-            }
-
-            var source = _sourceObject.AddComponent<AudioSource>();
-            source.clip = clip;
-            source.loop = true;
-
-            var probe = _sourceObject.AddComponent<AudioProbe>();
-            // Clear the audio data after it is read as to not play it through the speaker locally.
-            probe.ClearAfterInvocation();
-            probe.AudioRead += OnAudioRead;
-
             // Wait for microphone to actually start producing data with a timeout
             const float timeout = 2f;
             float elapsed = 0f;
@@ -181,174 +140,155 @@ private IEnumerator StartMicrophone()
                 yield break;
             }
 
-            // Playback is started by the pacing servo, which first measures the clip's true fill
-            // rate so the initial pitch and read position are right from the first sample.
-            MonoBehaviourContext.RunCoroutine(PaceMicrophonePlayback(source, clip));
-#if UNITY_EDITOR
-            MonoBehaviourContext.RunCoroutine(DumpClipOnce(clip));
-#endif
-            Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully");
-        }
-
-#if UNITY_EDITOR
-        // TEMP diagnostic: snapshots the raw mic clip to a WAV so its contents can be inspected
-        // offline — is it one contiguous audio stream, or voice fragments scattered between stale
-        // regions? Speak continuously for the first ~5 seconds of capture. Editor-only.
-        private IEnumerator DumpClipOnce(AudioClip clip)
-        {
-            yield return new WaitForSeconds(4f);
-            if (_disposed || clip == null) yield break;
-            try
-            {
-                var data = new float[clip.samples * clip.channels];
-                clip.GetData(data, 0);
-                var path = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "lk_mic_clip.wav");
-                WriteWav(path, data, clip.channels, clip.frequency);
-                Utils.Info($"MicrophoneSource: dumped clip snapshot to {path} ({clip.samples} frames @ {clip.frequency}Hz/{clip.channels}ch)");
-            }
-            catch (Exception e)
-            {
-                Utils.Warning($"MicrophoneSource: clip dump failed: {e.Message}");
-            }
-        }
+            Utils.Info($"MicrophoneSource device='{_deviceName}' clip={clip.frequency}Hz/{clip.channels}ch samples={clip.samples} requested={requestedRate}Hz target={TargetSampleRate}Hz");
 
-        private static void WriteWav(string path, float[] samples, int channels, int sampleRate)
-        {
-            using var fs = new System.IO.FileStream(path, System.IO.FileMode.Create);
-            using var w = new System.IO.BinaryWriter(fs);
-            int dataBytes = samples.Length * 2;
-            w.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
-            w.Write(36 + dataBytes);
-            w.Write(System.Text.Encoding.ASCII.GetBytes("WAVEfmt "));
-            w.Write(16);
-            w.Write((short)1);              // PCM
-            w.Write((short)channels);
-            w.Write(sampleRate);
-            w.Write(sampleRate * channels * 2);
-            w.Write((short)(channels * 2)); // block align
-            w.Write((short)16);             // bits per sample
-            w.Write(System.Text.Encoding.ASCII.GetBytes("data"));
-            w.Write(dataBytes);
-            foreach (var s in samples)
-                w.Write((short)(Mathf.Clamp(s, -1f, 1f) * 32767f));
+            _capturing = true;
+            MonoBehaviourContext.RunCoroutine(CaptureLoop(clip));
         }
-#endif
 
-        // Keeps the AudioSource's read head a fixed lag behind the (estimated) real write head
-        // (see the servo comment at the top of the class). Pitch stays ~1.0 — the clip data rate
-        // IS clip.frequency — with only tiny trims; the added latency is the held lag itself.
-        private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip)
+        // Coroutine: after a short pre-roll that measures the position counter, reads new samples
+        // from the clip's ring buffer each frame and pushes them to the native source via AudioRead.
+        // Runs on the main thread; the native source's queue absorbs the per-frame pacing jitter.
+        private IEnumerator CaptureLoop(AudioClip clip)
         {
             int clipFrames = clip.samples;
-            int declaredRate = clip.frequency;
+            int channels = clip.channels;
+            int dataRate = clip.frequency > 0 ? clip.frequency : (int)DefaultMicrophoneSampleRate;
 
-            // Pre-roll: measure how fast GetPosition's counter advances. Its instantaneous value
-            // can be jumpy, but its average advance is steady (±0.1% measured), so a short window
-            // gives a reliable rate. k is the counter's inflation relative to the data rate.
+            // Pre-roll: measure how fast the position counter advances (its average is steady even
+            // when individual values jump) and the size of its smallest discrete jump.
             int prevCounter = Microphone.GetPosition(_deviceName);
-            long counterUnwrapped = prevCounter; // counter ran since Microphone.Start; small so far
-            long preRollStart = counterUnwrapped;
+            long advance = 0;
+            long minJump = long.MaxValue;
             var preRoll = System.Diagnostics.Stopwatch.StartNew();
             while (preRoll.Elapsed.TotalSeconds < PreRollSeconds)
             {
-                if (!_started || _disposed || source == null || !Microphone.IsRecording(_deviceName)) yield break;
+                if (!_capturing || _disposed) yield break;
                 yield return null;
                 int c = Microphone.GetPosition(_deviceName);
-                counterUnwrapped += ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
+                long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
                 prevCounter = c;
+                advance += d;
+                if (d > 0 && d < minJump) minJump = d;
             }
-            if (!_started || _disposed || source == null) yield break;
-
-            double counterRate = (counterUnwrapped - preRollStart) / preRoll.Elapsed.TotalSeconds;
-            if (counterRate <= 0) counterRate = declaredRate;
-            double k = counterRate / declaredRate; // ~1 on healthy devices, ~3.2 on macOS BT-HFP
+            if (!_capturing || _disposed) yield break;
 
-            // Lag target, bounded by the clip's data capacity (clipFrames samples).
-            float capacityCapSec = 0.4f * clipFrames / declaredRate;
-            float targetLagSec = Mathf.Min(DefaultTargetLagSec, capacityCapSec);
-            double target = targetLagSec * declaredRate;
+            double counterRate = advance > 0 ? advance / preRoll.Elapsed.TotalSeconds : dataRate;
+            double k = counterRate / dataRate;
 
-            // Estimated real write head in data samples: the counter rescaled by k (both started
-            // at zero when capture began).
-            double writeEst = counterUnwrapped / k;
+            // Fragmented mode: the counter advances in jumps of `stride`, but only the first
+            // `validPerStride` samples of each stride contain data; the rest is zero padding.
+            bool fragmented = k > FragmentedKThreshold && minJump != long.MaxValue && minJump > 1;
+            int stride = fragmented ? (int)minJump : 0;
+            int validPerStride = fragmented ? Math.Max(1, (int)Math.Round(stride / k)) : 0;
 
-            source.pitch = 1f;
-            source.Play();
-            int startRead = (int)((((long)(writeEst - target)) % clipFrames + clipFrames) % clipFrames);
-            source.timeSamples = startRead;
-            int prevRead = startRead;
-            double lag = target; // data samples the reader trails the estimated writer
-            double smoothedLag = lag;
-            double jitter = 0;
-
-            Utils.Info($"MicrophoneSource pacing: counter={counterRate:F0}/s k={k:F2} dataRate={declaredRate}Hz lag={targetLagSec * 1000:F0}ms");
+            if (fragmented)
+                Utils.Info($"MicrophoneSource: fragmented clip detected (k={k:F2}); reading {validPerStride} of every {stride} samples at {dataRate}Hz");
+            else
+                Utils.Info($"MicrophoneSource: contiguous capture (k={k:F2}) at {dataRate}Hz");
 
-            long counterWindow = 0;
-            var rateWindow = System.Diagnostics.Stopwatch.StartNew();
-            var statusWindow = System.Diagnostics.Stopwatch.StartNew();
+            _resamplePos = 0.0;
+            _resamplePrev = 0f;
+            long maxBacklog = (long)(counterRate * MaxBacklogSeconds);
+            int readPos = prevCounter; // counter values land on jump boundaries
+            long pending = 0;
 
-            while (_started && !_disposed && source != null && Microphone.IsRecording(_deviceName))
+            while (_capturing && !_disposed)
             {
                 yield return null;
-                if (source == null) yield break;
 
                 int c = Microphone.GetPosition(_deviceName);
-                int r = source.timeSamples;
-                // Unwrapped per-frame advances. A hitch longer than the clip aliases these; the
-                // resync guard below recovers from the resulting inconsistency.
-                long dc = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
-                long dr = ((r - prevRead) % clipFrames + clipFrames) % clipFrames;
+                long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
                 prevCounter = c;
-                prevRead = r;
-                counterUnwrapped += dc;
-                counterWindow += dc;
-                lag += dc / k - dr;
+                pending += d;
 
-                smoothedLag = 0.95 * smoothedLag + 0.05 * lag;
-                jitter = 0.95 * jitter + 0.05 * Math.Abs(lag - smoothedLag);
+                // After a long stall, drop the oldest backlog instead of pushing a burst that would
+                // overrun the native source's queue. Stride rounding can leave nothing to drop.
+                if (pending > maxBacklog)
+                {
+                    long drop = pending - maxBacklog;
+                    if (fragmented) drop -= drop % stride; // preserve stride alignment
+                    readPos = (int)((readPos + drop) % clipFrames);
+                    pending -= drop;
+                    if (drop > 0) Utils.Warning($"MicrophoneSource: dropped {drop} buffered samples after a stall");
+                }
 
-                // Refine the counter rate and adapt the lag target once per second.
-                if (rateWindow.Elapsed.TotalSeconds >= 1.0)
+                if (fragmented)
                 {
-                    double instRate = counterWindow / rateWindow.Elapsed.TotalSeconds;
-                    if (instRate > 0)
+                    while (pending >= stride)
                     {
-                        counterRate = 0.7 * counterRate + 0.3 * instRate;
-                        k = counterRate / declaredRate;
+                        EmitClipRange(clip, channels, dataRate, readPos, validPerStride, clipFrames);
+                        readPos = (readPos + stride) % clipFrames;
+                        pending -= stride;
                     }
-                    counterWindow = 0;
-                    rateWindow.Restart();
-
-                    // Hold ~4x the observed jitter as safety margin, within bounds and capacity.
-                    float jitterSec = (float)(jitter / declaredRate);
-                    targetLagSec = Mathf.Min(Mathf.Clamp(jitterSec * 4f, MinTargetLagSec, MaxTargetLagSec), capacityCapSec);
-                    target = targetLagSec * declaredRate;
                 }
-
-                // Tiny proportional pitch trim toward the target lag. The data rate is
-                // clip.frequency, so pitch must stay pinned near 1.
-                double relErr = (smoothedLag - target) / target;
-                relErr = Math.Max(-1.0, Math.Min(1.0, relErr));
-                source.pitch = 1f + Mathf.Clamp((float)(TrimGain * relErr) * MaxPitchTrim, -MaxPitchTrim, MaxPitchTrim);
-
-                // Out of bounds (reader overran the writer, or fell so far behind it reads
-                // overwritten data): jump back to the target lag. Audible once, then stable.
-                if (lag < 0 || lag > clipFrames * 0.9)
+                else if (pending > 0)
                 {
-                    int resyncRead = (int)((((long)(counterUnwrapped / k - target)) % clipFrames + clipFrames) % clipFrames);
-                    source.timeSamples = resyncRead;
-                    prevRead = resyncRead;
-                    lag = target;
-                    smoothedLag = target;
-                    Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (k={k:F2} pitch={source.pitch:F3})");
+                    EmitClipRange(clip, channels, dataRate, readPos, (int)pending, clipFrames);
+                    readPos = (int)((readPos + pending) % clipFrames);
+                    pending = 0;
                 }
+            }
+        }
+
+        // Emits `count` frames beginning at ring index `start`. A range that runs past the end of
+        // the clip is issued as two reads so that every GetData call covers a contiguous span.
+        private void EmitClipRange(AudioClip clip, int channels, int dataRate, int start, int count, int clipFrames)
+        {
+            if (count <= 0) return;
+            int untilWrap = clipFrames - start;
+            int head = count < untilWrap ? count : untilWrap;
+            ReadAndPush(clip, channels, dataRate, start, head);
+            if (head < count) ReadAndPush(clip, channels, dataRate, 0, count - head);
+        }
 
-                if (statusWindow.Elapsed.TotalSeconds >= 5.0)
+        // Reads a contiguous range of `count` frames starting at `start`, averages the channels to
+        // mono, and linearly resamples dataRate -> TargetSampleRate. Interpolation state is kept in
+        // _resamplePos/_resamplePrev so fragment junctions stay continuous. Fires AudioRead if any output.
+        private void ReadAndPush(AudioClip clip, int channels, int dataRate, int start, int count)
+        {
+            if (count <= 0) return;
+
+            var interleaved = new float[count * channels];
+            clip.GetData(interleaved, start);
+
+            float[] mono;
+            if (channels == 1)
+            {
+                mono = interleaved;
+            }
+            else
+            {
+                mono = new float[count];
+                for (int f = 0; f < count; f++)
                 {
-                    Utils.Info($"MicrophoneSource pacing: k={k:F2} pitch={source.pitch:F3} lag={smoothedLag / declaredRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / declaredRate * 1000:F1}ms");
-                    statusWindow.Restart();
+                    float sum = 0f;
+                    for (int ch = 0; ch < channels; ch++)
+                        sum += interleaved[f * channels + ch];
+                    mono[f] = sum / channels;
                 }
             }
+
+            double step = (double)dataRate / TargetSampleRate;
+            var output = new List<float>((int)(count / step) + 2);
+
+            // pos is a fractional index into mono. Index -1 stands for _resamplePrev, the last sample
+            // of the previous chunk, so interpolation spans chunk boundaries; pos stays >= -1.
+            double pos = _resamplePos;
+            while (pos < count - 1)
+            {
+                int i0 = (int)Math.Floor(pos);
+                float a = i0 < 0 ? _resamplePrev : mono[i0];
+                float b = mono[i0 + 1];
+                float frac = (float)(pos - i0);
+                output.Add(a * (1f - frac) + b * frac);
+                pos += step;
+            }
+            _resamplePrev = mono[count - 1]; // becomes index -1 of the next chunk
+            _resamplePos = pos - count; // rebase the read position onto the next chunk's indices
+
+            if (output.Count > 0)
+                AudioRead?.Invoke(output.ToArray(), 1, (int)TargetSampleRate);
         }
 
         /// <summary>
@@ -364,33 +304,15 @@ public override void Stop()
 
         private IEnumerator StopMicrophone()
         {
+            _capturing = false; // makes CaptureLoop exit at its next _capturing check
+
             if (Microphone.IsRecording(_deviceName))
                 Microphone.End(_deviceName);
 
-            // Check if GameObject is still valid before trying to access components
-            if (_sourceObject != null)
-            {
-                var probe = _sourceObject.GetComponent<AudioProbe>();
-                if (probe != null)
-                {
-                    probe.AudioRead -= OnAudioRead;
-                    UnityEngine.Object.Destroy(probe);
-                }
-
-                var source = _sourceObject.GetComponent<AudioSource>();
-                if (source != null)
-                    UnityEngine.Object.Destroy(source);
-            }
-
             Utils.Debug($"MicrophoneSource device='{_deviceName}' stopped");
             yield return null;
         }
 
-        private void OnAudioRead(float[] data, int channels, int sampleRate)
-        {
-            AudioRead?.Invoke(data, channels, sampleRate);
-        }
-
         private void OnApplicationPause(bool pause)
         {
             if (!_started)
@@ -458,4 +380,4 @@ protected override void Dispose(bool disposing)
             Dispose(false);
         }
     }
-}
\ No newline at end of file
+}