From 2273834aaf5a51003c3b83ee0952e7ab1da40deb Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:16:43 +0200 Subject: [PATCH 1/5] Fix microphone capture: device-true source format + fragment-aware clip reading Publishing the microphone with a Bluetooth HFP headset on macOS produced "sample_rate and num_channels don't match" errors from the native source and, beyond that, persistently choppy or garbled audio on receivers. Two root causes, both fixed here: 1. The native (Rust) audio source was created with a hardcoded format (48000Hz/2ch) while captured frames arrive at whatever format the device actually delivers. The native source rejects mismatched frames (it does not resample). RtcAudioSource now has two constructors: a device-mode one that resolves the format from Unity's output configuration, and an explicit-format one for sources that know their exact rate/channels. Frames that still mismatch are dropped with a throttled warning instead of erroring natively. 2. On macOS with a Bluetooth HFP headset, Unity's Microphone clip buffer is fragmented: FMOD writes each real 20ms packet of clip.frequency audio, then advances Microphone.GetPosition as if it had written ~3.2x as much, zero-filling the skipped range. A raw buffer dump showed valid fragments of exactly 320 samples at a stride of exactly 1024 (a valid fraction of 320/1024 = 1/k, where k ~ 3.2 is the counter inflation), with the fragments joining continuously - the stream is intact, just scattered. Every playback-based capture strategy therefore chops (31% voice, 69% padding) and counter-paced reading garbles. MicrophoneSource now reads the clip ring buffer directly (no AudioSource, no OnAudioFilterRead - which also decouples capture from the output device's clock). A short pre-roll measures the counter rate (k = counterRate / clip.frequency) and the counter's smallest discrete jump (the stride). 
Healthy devices (k ~ 1) use a plain contiguous read; fragmented devices (k > 1.05) read only the first stride/k samples of each stride - exactly the valid fragments. Captured audio is downmixed to mono and resampled from clip.frequency to a fixed 48kHz native source, preserving the publish-before-start contract. Backlog beyond 200ms after a stall is dropped, stride-aligned, to avoid overrunning the native queue. Also removes the redundant Microphone.Start in the Meet sample and lets the test sine source declare its exact format explicitly. Verified end-to-end: macOS publisher with the Bluetooth headset microphone to an Android receiver now sounds clean and correct-pitch; healthy microphones take the contiguous path unchanged. Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/BasicAudioSource.cs | 6 +- Runtime/Scripts/MicrophoneSource.cs | 256 +++++++++++++++----- Runtime/Scripts/RtcAudioSource.cs | 81 ++++++- Samples~/Meet/Assets/Runtime/MeetManager.cs | 3 +- Tests/PlayMode/Utils/SineWaveAudioSource.cs | 2 +- 5 files changed, 273 insertions(+), 75 deletions(-) diff --git a/Runtime/Scripts/BasicAudioSource.cs b/Runtime/Scripts/BasicAudioSource.cs index 3b63680b..8193090d 100644 --- a/Runtime/Scripts/BasicAudioSource.cs +++ b/Runtime/Scripts/BasicAudioSource.cs @@ -19,9 +19,11 @@ sealed public class BasicAudioSource : RtcAudioSource /// Creates a new basic audio source for the given in the scene. /// /// The to capture from. - /// The number of channels to capture. /// The type of audio source. - public BasicAudioSource(AudioSource source, int channels = 2, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(channels, sourceType) + /// + /// The sample rate and channel count are taken from Unity's audio configuration. 
+ /// + public BasicAudioSource(AudioSource source, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(sourceType) { _source = source; } diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 904b8da7..1a1823f0 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -1,5 +1,6 @@ using System; using System.Collections; +using System.Collections.Generic; using UnityEngine; using LiveKit.Internal; @@ -13,25 +14,59 @@ namespace LiveKit /// sealed public class MicrophoneSource : RtcAudioSource { - private readonly GameObject _sourceObject; + // --- Capture design --- + // The microphone clip's ring buffer is read directly (no AudioSource playback, no + // OnAudioFilterRead), so capture is decoupled from the output device's clock. + // + // Microphone.GetPosition cannot be trusted as a sample position on every platform. On + // macOS with a Bluetooth HFP headset, FMOD writes each real 20ms packet of clip.frequency + // audio, then advances the position counter ~3.2x too far and zero-fills the skipped + // range. The buffer then holds valid fragments of N samples at a stride J (measured: 320 + // of every 1024) and the counter rate is k = J/N times the data rate. Inspection of a raw + // buffer dump showed the fragments are consecutive speech that joins continuously, so the + // stream is reconstructed losslessly by reading only the first N = J/k samples of each + // stride. Healthy devices have k ~ 1 and use a plain contiguous read. + // + // The clip's data rate is clip.frequency (verified: fragments play at correct pitch), so + // captured samples are resampled from clip.frequency to the fixed native-source rate. 
+ private const uint TargetSampleRate = 48000; + private const float PreRollSeconds = 0.3f; + private const double FragmentedKThreshold = 1.05; + private const float MaxBacklogSeconds = 0.2f; // drop backlog beyond this after a stall + private readonly string _deviceName; public override event Action AudioRead; private bool _disposed = false; private bool _started = false; + private volatile bool _capturing = false; + + // Streaming linear-resampler state (input = clip.frequency, output = TargetSampleRate). + private double _resamplePos; + private float _resamplePrev; /// /// Creates a new microphone source for the given device. /// /// The name of the device to capture from. Use to /// get the list of available devices. - /// The GameObject to attach the AudioSource to. The object must be kept in the scene - /// for the duration of the source's lifetime. - public MicrophoneSource(string deviceName, GameObject sourceObject) : base(2, RtcAudioSourceType.AudioSourceMicrophone) + /// Unused; retained for compatibility. The microphone clip is read + /// directly, so no scene GameObject/AudioSource is required. + public MicrophoneSource(string deviceName, GameObject sourceObject) + : base(RtcAudioSourceType.AudioSourceMicrophone, TargetSampleRate, 1) { _deviceName = deviceName; - _sourceObject = sourceObject; + } + + // The rate requested from Microphone.Start (a hint the platform may not honor), clamped to + // the device's reported range. The authoritative data rate is clip.frequency afterwards. 
+ private static int ResolveRequestedSampleRate(string deviceName) + { + Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq); + if (minFreq == 0 && maxFreq == 0) + return (int)TargetSampleRate; + return Mathf.Clamp((int)TargetSampleRate, minFreq, maxFreq); } /// @@ -49,7 +84,6 @@ public override void Start() base.Start(); if (_started) return; - if (!Application.HasUserAuthorization(mode: UserAuthorization.Microphone)) throw new InvalidOperationException("Microphone access not authorized"); @@ -61,13 +95,6 @@ public override void Start() private IEnumerator StartMicrophone() { - // Validate that the GameObject is still valid before starting - if (_sourceObject == null) - { - Utils.Error("MicrophoneSource: GameObject is null, cannot start microphone"); - yield break; - } - // Verify microphone is still authorized (could change during background) if (!Application.HasUserAuthorization(UserAuthorization.Microphone)) { @@ -76,13 +103,14 @@ private IEnumerator StartMicrophone() } AudioClip clip = null; + int requestedRate = ResolveRequestedSampleRate(_deviceName); try { clip = Microphone.Start( _deviceName, loop: true, - lengthSec: 1, - frequency: (int)DefaultMicrophoneSampleRate + lengthSec: 2, + frequency: requestedRate ); } catch (Exception e) @@ -97,29 +125,6 @@ private IEnumerator StartMicrophone() yield break; } - // Ensure no duplicate components exist before adding new ones. - // This is important during app resume on iOS where components might not be - // fully destroyed yet due to Unity's deferred Destroy(). 
- var existingSource = _sourceObject.GetComponent(); - if (existingSource != null) - UnityEngine.Object.DestroyImmediate(existingSource); - - var existingProbe = _sourceObject.GetComponent(); - if (existingProbe != null) - { - existingProbe.AudioRead -= OnAudioRead; - UnityEngine.Object.DestroyImmediate(existingProbe); - } - - var source = _sourceObject.AddComponent(); - source.clip = clip; - source.loop = true; - - var probe = _sourceObject.AddComponent(); - // Clear the audio data after it is read as to not play it through the speaker locally. - probe.ClearAfterInvocation(); - probe.AudioRead += OnAudioRead; - // Wait for microphone to actually start producing data with a timeout const float timeout = 2f; float elapsed = 0f; @@ -135,8 +140,155 @@ private IEnumerator StartMicrophone() yield break; } - source.Play(); - Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully"); + Utils.Info($"MicrophoneSource device='{_deviceName}' clip={clip.frequency}Hz/{clip.channels}ch samples={clip.samples} requested={requestedRate}Hz target={TargetSampleRate}Hz"); + + _capturing = true; + MonoBehaviourContext.RunCoroutine(CaptureLoop(clip)); + } + + // Reads new samples from the clip's ring buffer each frame and pushes them to the native + // source via AudioRead. Runs on the main thread; the native source's queue absorbs the + // per-frame pacing jitter. + private IEnumerator CaptureLoop(AudioClip clip) + { + int clipFrames = clip.samples; + int channels = clip.channels; + int dataRate = clip.frequency > 0 ? clip.frequency : (int)DefaultMicrophoneSampleRate; + + // Pre-roll: measure how fast the position counter advances (its average is steady even + // when individual values jump) and the size of its smallest discrete jump. 
+ int prevCounter = Microphone.GetPosition(_deviceName); + long advance = 0; + long minJump = long.MaxValue; + var preRoll = System.Diagnostics.Stopwatch.StartNew(); + while (preRoll.Elapsed.TotalSeconds < PreRollSeconds) + { + if (!_capturing || _disposed) yield break; + yield return null; + int c = Microphone.GetPosition(_deviceName); + long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; + prevCounter = c; + advance += d; + if (d > 0 && d < minJump) minJump = d; + } + if (!_capturing || _disposed) yield break; + + double counterRate = advance > 0 ? advance / preRoll.Elapsed.TotalSeconds : dataRate; + double k = counterRate / dataRate; + + // Fragmented mode: the counter advances in jumps of `stride`, but only the first + // `validPerStride` samples of each stride contain data; the rest is zero padding. + bool fragmented = k > FragmentedKThreshold && minJump != long.MaxValue && minJump > 1; + int stride = fragmented ? (int)minJump : 0; + int validPerStride = fragmented ? Math.Max(1, (int)Math.Round(stride / k)) : 0; + + if (fragmented) + Utils.Info($"MicrophoneSource: fragmented clip detected (k={k:F2}); reading {validPerStride} of every {stride} samples at {dataRate}Hz"); + else + Utils.Info($"MicrophoneSource: contiguous capture (k={k:F2}) at {dataRate}Hz"); + + _resamplePos = 0.0; + _resamplePrev = 0f; + long maxBacklog = (long)(counterRate * MaxBacklogSeconds); + int readPos = prevCounter; // counter values land on jump boundaries + long pending = 0; + + while (_capturing && !_disposed) + { + yield return null; + + int c = Microphone.GetPosition(_deviceName); + long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; + prevCounter = c; + pending += d; + + // After a long stall, drop the oldest backlog instead of pushing a burst that + // would overrun the native source's queue. 
+ if (pending > maxBacklog) + { + long drop = pending - maxBacklog; + if (fragmented) drop -= drop % stride; // preserve stride alignment + readPos = (int)((readPos + drop) % clipFrames); + pending -= drop; + Utils.Warning($"MicrophoneSource: dropped {drop} buffered samples after a stall"); + } + + if (fragmented) + { + while (pending >= stride) + { + EmitClipRange(clip, channels, dataRate, readPos, validPerStride, clipFrames); + readPos = (readPos + stride) % clipFrames; + pending -= stride; + } + } + else if (pending > 0) + { + EmitClipRange(clip, channels, dataRate, readPos, (int)pending, clipFrames); + readPos = (int)((readPos + pending) % clipFrames); + pending = 0; + } + } + } + + // Reads `count` frames starting at `start`, splitting at the ring wrap so each GetData + // read is contiguous. + private void EmitClipRange(AudioClip clip, int channels, int dataRate, int start, int count, int clipFrames) + { + if (count <= 0) return; + int first = Math.Min(count, clipFrames - start); + ReadAndPush(clip, channels, dataRate, start, first); + if (count > first) + ReadAndPush(clip, channels, dataRate, 0, count - first); + } + + // Reads a contiguous range, downmixes to mono, resamples dataRate -> TargetSampleRate + // (streaming linear interpolation carrying state across calls, so fragment junctions stay + // continuous), and fires AudioRead. 
+ private void ReadAndPush(AudioClip clip, int channels, int dataRate, int start, int count) + { + if (count <= 0) return; + + var interleaved = new float[count * channels]; + clip.GetData(interleaved, start); + + float[] mono; + if (channels == 1) + { + mono = interleaved; + } + else + { + mono = new float[count]; + for (int f = 0; f < count; f++) + { + float sum = 0f; + for (int ch = 0; ch < channels; ch++) + sum += interleaved[f * channels + ch]; + mono[f] = sum / channels; + } + } + + double step = (double)dataRate / TargetSampleRate; + var output = new List((int)(count / step) + 2); + + // Index -1 maps to the carried last sample of the previous chunk so interpolation is + // continuous across chunk boundaries. pos stays >= -1. + double pos = _resamplePos; + while (pos < count - 1) + { + int i0 = (int)Math.Floor(pos); + float a = i0 < 0 ? _resamplePrev : mono[i0]; + float b = mono[i0 + 1]; + float frac = (float)(pos - i0); + output.Add(a * (1f - frac) + b * frac); + pos += step; + } + _resamplePrev = mono[count - 1]; + _resamplePos = pos - count; + + if (output.Count > 0) + AudioRead?.Invoke(output.ToArray(), 1, (int)TargetSampleRate); } /// @@ -152,33 +304,15 @@ public override void Stop() private IEnumerator StopMicrophone() { + _capturing = false; + if (Microphone.IsRecording(_deviceName)) Microphone.End(_deviceName); - // Check if GameObject is still valid before trying to access components - if (_sourceObject != null) - { - var probe = _sourceObject.GetComponent(); - if (probe != null) - { - probe.AudioRead -= OnAudioRead; - UnityEngine.Object.Destroy(probe); - } - - var source = _sourceObject.GetComponent(); - if (source != null) - UnityEngine.Object.Destroy(source); - } - Utils.Debug($"MicrophoneSource device='{_deviceName}' stopped"); yield return null; } - private void OnAudioRead(float[] data, int channels, int sampleRate) - { - AudioRead?.Invoke(data, channels, sampleRate); - } - private void OnApplicationPause(bool pause) { if (!_started) @@ -246,4 
+380,4 @@ protected override void Dispose(bool disposing) Dispose(false); } } -} \ No newline at end of file +} diff --git a/Runtime/Scripts/RtcAudioSource.cs b/Runtime/Scripts/RtcAudioSource.cs index a9af8a0a..43f5c102 100644 --- a/Runtime/Scripts/RtcAudioSource.cs +++ b/Runtime/Scripts/RtcAudioSource.cs @@ -83,20 +83,33 @@ private sealed class PendingAudioFrame private volatile bool _disposed = false; private int _audioReadCount = 0; - protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = RtcAudioSourceType.AudioSourceCustom) + // Device-capture sources (microphone, AudioSource taps) don't know their format ahead of + // time — it is whatever Unity's audio graph delivers. They use this constructor, which + // configures the native source from Unity's current output configuration. + protected RtcAudioSource(RtcAudioSourceType audioSourceType) + : this(audioSourceType, 0, 0) { } + + // Sources that generate a fixed, known format (e.g. test signal generators) declare it + // directly. Passing 0 for either value falls back to the device configuration. + protected RtcAudioSource(RtcAudioSourceType audioSourceType, uint sampleRate, uint channels) { _sourceType = audioSourceType; - _expectedChannels = (uint)channels; + + if (sampleRate > 0 && channels > 0) + { + _expectedSampleRate = sampleRate; + _expectedChannels = channels; + } + else + { + (_expectedSampleRate, _expectedChannels) = ResolveDeviceFormat(); + } using var request = FFIBridge.Instance.NewRequest(); var newAudioSource = request.request; newAudioSource.Type = AudioSourceType.AudioSourceNative; - newAudioSource.NumChannels = (uint)channels; - newAudioSource.SampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone ? 
- DefaultMicrophoneSampleRate : DefaultSampleRate; - _expectedSampleRate = newAudioSource.SampleRate; - - Utils.Debug($"NewAudioSource: {newAudioSource.NumChannels} {newAudioSource.SampleRate}"); + newAudioSource.NumChannels = _expectedChannels; + newAudioSource.SampleRate = _expectedSampleRate; newAudioSource.Options = request.TempResource(); newAudioSource.Options.EchoCancellation = true; @@ -109,6 +122,49 @@ protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = Utils.Debug($"{DebugTag} created handle={Handle.DangerousGetHandle()} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}"); } + // Reads Unity's actual output audio configuration. The capture path delivers buffers at the + // DSP output rate/channel count (see AudioProbe), so this is the format the native source + // must match. Falls back to the platform defaults when Unity cannot report a configuration + // (e.g. batch mode without an audio device). + private (uint sampleRate, uint channels) ResolveDeviceFormat() + { + uint sampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone + ? 
DefaultMicrophoneSampleRate + : DefaultSampleRate; + uint channels = DefaultChannels; + + try + { + var config = UnityEngine.AudioSettings.GetConfiguration(); + if (config.sampleRate > 0) + sampleRate = (uint)config.sampleRate; + var configuredChannels = SpeakerModeChannels(config.speakerMode); + if (configuredChannels > 0) + channels = configuredChannels; + } + catch (Exception e) + { + Utils.Warning($"{DebugTag} could not read Unity audio configuration, using defaults: {e.Message}"); + } + + return (sampleRate, channels); + } + + private static uint SpeakerModeChannels(UnityEngine.AudioSpeakerMode mode) + { + switch (mode) + { + case UnityEngine.AudioSpeakerMode.Mono: return 1; + case UnityEngine.AudioSpeakerMode.Stereo: return 2; + case UnityEngine.AudioSpeakerMode.Quad: return 4; + case UnityEngine.AudioSpeakerMode.Surround: return 5; + case UnityEngine.AudioSpeakerMode.Mode5point1: return 6; + case UnityEngine.AudioSpeakerMode.Mode7point1: return 8; + case UnityEngine.AudioSpeakerMode.Prologic: return 2; + default: return 0; + } + } + /// /// Begin capturing audio samples from the underlying source. /// @@ -153,9 +209,16 @@ private void OnAudioRead(float[] data, int channels, int sampleRate) return; } + // The native source rejects frames whose rate/channels differ from how it was + // configured (it does not resample). This should not happen now that sources declare + // or resolve their real format, but if Unity reports an inconsistent format — or the + // output configuration changes at runtime — we drop the frame instead of sending a + // mismatch the native side would error on. 
if ((uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels) { - Utils.Warning($"{DebugTag} audio frame #{frameIndex} metadata mismatch actualRate={sampleRate} actualChannels={channels} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}"); + if (frameIndex == 1 || frameIndex % 100 == 0) + Utils.Warning($"{DebugTag} dropping audio frame #{frameIndex}: format {sampleRate}/{channels} does not match source {_expectedSampleRate}/{_expectedChannels} (sourceType={_sourceType})"); + return; } var pendingBeforeSend = PendingFrameCount(); diff --git a/Samples~/Meet/Assets/Runtime/MeetManager.cs b/Samples~/Meet/Assets/Runtime/MeetManager.cs index 225c7a0c..97b2cb70 100644 --- a/Samples~/Meet/Assets/Runtime/MeetManager.cs +++ b/Samples~/Meet/Assets/Runtime/MeetManager.cs @@ -453,8 +453,7 @@ private IEnumerator PublishLocalMicrophone() { if (_audioObjects.ContainsKey(LocalAudioTrackName)) yield break; - Microphone.Start(null, true, 10, 44100); - + // MicrophoneSource starts the device itself, so we only need the device name here. 
var audioObject = new GameObject($"My Microphone: {Microphone.devices[0]}"); audioObject.transform.SetParent(_audioTrackParent); diff --git a/Tests/PlayMode/Utils/SineWaveAudioSource.cs b/Tests/PlayMode/Utils/SineWaveAudioSource.cs index 907e9ccc..2337615b 100644 --- a/Tests/PlayMode/Utils/SineWaveAudioSource.cs +++ b/Tests/PlayMode/Utils/SineWaveAudioSource.cs @@ -31,7 +31,7 @@ public SineWaveAudioSource( int sampleRate = 48000, double frequencyHz = 440.0, float amplitude = 0.1f) - : base(channels, RtcAudioSourceType.AudioSourceCustom) + : base(RtcAudioSourceType.AudioSourceCustom, (uint)sampleRate, (uint)channels) { _channels = channels; _sampleRate = sampleRate; From 35031cc647393c6e583c47bc1cc5d8424eba418f Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:39:12 +0200 Subject: [PATCH 2/5] Extract mic clip reading logic into testable classes + EditMode tests The fragment-aware capture logic is subtle and was painful to diagnose, but most of it is pure logic that doesn't need a microphone. Extract it from MicrophoneSource into two UnityEngine-free internal classes: - MicClipReader: pre-roll measurement (counter rate k, smallest jump = stride), contiguous vs fragmented mode selection, per-stride valid-range emission, ring-wrap splitting, and stride-aligned backlog dropping. - StreamingResampler: the streaming linear resampler (state carries across chunks so fragment junctions stay continuous). MicrophoneSource.CaptureLoop becomes a thin Unity shell: poll GetPosition, feed the reader, GetData the emitted ranges, downmix, resample, push. Behavior is unchanged. 
Add EditMode tests covering: healthy contiguous capture (k~1, every sample emitted), fragmented detection (k=3.2, stride 1024, valid 320 - the exact structure dumped from the Sony MDR-1000X on macOS), lossless reconstruction of a synthetic fragmented buffer across multiple ring laps (strictly sequential output, no gaps/repeats/padding), stride-aligned backlog drops bounded by the limit, pre-roll emitting nothing, resampler frequency/length preservation, and chunked-equals-whole resampling (1-sample tail tolerance for float boundary rounding). Logic verified by executing all test scenarios in a standalone harness (mono) in addition to compiling the Unity assemblies. Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/Internal/MicClipReader.cs | 160 ++++++++++++ .../Scripts/Internal/StreamingResampler.cs | 54 ++++ Runtime/Scripts/MicrophoneSource.cs | 142 +++------- Tests/EditMode/MicClipReaderTests.cs | 243 ++++++++++++++++++ 4 files changed, 490 insertions(+), 109 deletions(-) create mode 100644 Runtime/Scripts/Internal/MicClipReader.cs create mode 100644 Runtime/Scripts/Internal/StreamingResampler.cs create mode 100644 Tests/EditMode/MicClipReaderTests.cs diff --git a/Runtime/Scripts/Internal/MicClipReader.cs b/Runtime/Scripts/Internal/MicClipReader.cs new file mode 100644 index 00000000..cb04b9fa --- /dev/null +++ b/Runtime/Scripts/Internal/MicClipReader.cs @@ -0,0 +1,160 @@ +using System; +using System.Collections.Generic; + +namespace LiveKit.Internal +{ + /// + /// Pure logic for reading a looping microphone clip ring buffer whose position counter may be + /// unreliable. Free of UnityEngine dependencies so it can be unit tested. + /// + /// + /// On most devices the position counter advances at the clip's data rate and capture is a + /// plain contiguous read. 
On macOS with a Bluetooth HFP headset, however, FMOD writes each + /// real packet of clip.frequency audio and then advances the counter k (~3.2) times too far, + /// zero-filling the skipped range: the buffer holds valid fragments of N samples at a stride + /// J (measured: 320 of every 1024), the fragments join continuously, and the counter rate is + /// k = J/N times the data rate. The reader measures the counter rate and its smallest + /// discrete jump during a pre-roll; when the rate is inflated it emits only the first J/k + /// samples of each stride, reconstructing the contiguous stream. + /// + internal sealed class MicClipReader + { + public struct ReadRange + { + public int Start; + public int Count; + } + + private readonly int _clipFrames; + private readonly int _dataRate; + private readonly double _preRollSeconds; + private readonly double _fragmentedKThreshold; + private readonly double _maxBacklogSeconds; + + private bool _hasFirstSample; + private int _prevCounter; + private double _preRollStart; + private long _preRollAdvance; + private long _minJump = long.MaxValue; + + private long _maxBacklog; + private int _readPos; + private long _pending; + + /// False during the pre-roll measurement window; no ranges are emitted until ready. + public bool Ready { get; private set; } + + /// True when the counter rate is inflated and only part of each stride holds data. + public bool Fragmented { get; private set; } + + /// Counter samples per fragment cycle (0 when not fragmented). + public int Stride { get; private set; } + + /// Valid data samples at the start of each stride (0 when not fragmented). + public int ValidPerStride { get; private set; } + + /// Measured counter advance per second. + public double CounterRate { get; private set; } + + /// Counter inflation factor: CounterRate / dataRate (~1 on healthy devices). + public double K { get; private set; } + + /// Total counter samples discarded by backlog drops (e.g. after a stall). 
+ public long TotalDropped { get; private set; } + + public MicClipReader(int clipFrames, int dataRate, + double preRollSeconds = 0.3, double fragmentedKThreshold = 1.05, double maxBacklogSeconds = 0.2) + { + if (clipFrames <= 0) throw new ArgumentOutOfRangeException(nameof(clipFrames)); + if (dataRate <= 0) throw new ArgumentOutOfRangeException(nameof(dataRate)); + _clipFrames = clipFrames; + _dataRate = dataRate; + _preRollSeconds = preRollSeconds; + _fragmentedKThreshold = fragmentedKThreshold; + _maxBacklogSeconds = maxBacklogSeconds; + } + + /// + /// Feeds the current counter position at a monotonic time and appends the ranges that + /// should be read from the clip (already split at the ring wrap) to . + /// + public void Update(int counterPosition, double elapsedSeconds, List ranges) + { + if (!_hasFirstSample) + { + _hasFirstSample = true; + _prevCounter = counterPosition; + _preRollStart = elapsedSeconds; + return; + } + + long d = ((counterPosition - _prevCounter) % _clipFrames + _clipFrames) % _clipFrames; + _prevCounter = counterPosition; + + if (!Ready) + { + _preRollAdvance += d; + if (d > 0 && d < _minJump) _minJump = d; + double window = elapsedSeconds - _preRollStart; + if (window >= _preRollSeconds) + FinishPreRoll(window); + return; + } + + _pending += d; + + // After a long stall, drop the oldest backlog instead of pushing a burst that would + // overrun the consumer. (A stall longer than one counter lap aliases the unwrapped + // advance and silently loses whole laps; this bound covers everything observable.) 
+ if (_pending > _maxBacklog) + { + long drop = _pending - _maxBacklog; + if (Fragmented) drop -= drop % Stride; // preserve stride alignment + if (drop > 0) + { + _readPos = (int)((_readPos + drop) % _clipFrames); + _pending -= drop; + TotalDropped += drop; + } + } + + if (Fragmented) + { + while (_pending >= Stride) + { + EmitSplit(_readPos, ValidPerStride, ranges); + _readPos = (_readPos + Stride) % _clipFrames; + _pending -= Stride; + } + } + else if (_pending > 0) + { + EmitSplit(_readPos, (int)_pending, ranges); + _readPos = (int)((_readPos + _pending) % _clipFrames); + _pending = 0; + } + } + + private void FinishPreRoll(double windowSeconds) + { + CounterRate = _preRollAdvance > 0 ? _preRollAdvance / windowSeconds : _dataRate; + K = CounterRate / _dataRate; + Fragmented = K > _fragmentedKThreshold && _minJump != long.MaxValue && _minJump > 1; + Stride = Fragmented ? (int)_minJump : 0; + ValidPerStride = Fragmented ? Math.Max(1, (int)Math.Round(Stride / K)) : 0; + _maxBacklog = (long)(CounterRate * _maxBacklogSeconds); + _readPos = _prevCounter; // counter values land on jump boundaries + _pending = 0; + Ready = true; + } + + private void EmitSplit(int start, int count, List ranges) + { + if (count <= 0) return; + int first = Math.Min(count, _clipFrames - start); + ranges.Add(new ReadRange { Start = start, Count = first }); + if (count > first) + ranges.Add(new ReadRange { Start = 0, Count = count - first }); + } + } +} diff --git a/Runtime/Scripts/Internal/StreamingResampler.cs b/Runtime/Scripts/Internal/StreamingResampler.cs new file mode 100644 index 00000000..dd9be2d6 --- /dev/null +++ b/Runtime/Scripts/Internal/StreamingResampler.cs @@ -0,0 +1,54 @@ +using System; +using System.Collections.Generic; + +namespace LiveKit.Internal +{ + /// + /// Streaming linear resampler for mono audio. Interpolation state carries across chunks, so a + /// stream processed in arbitrary slices produces the same output as processing it whole. 
+ /// Free of UnityEngine dependencies so it can be unit tested. + /// + internal sealed class StreamingResampler + { + private readonly double _step; // input samples advanced per output sample + private double _pos; // fractional read position; >= -1, where -1 maps to _prev + private float _prev; // last sample of the previous chunk + + public StreamingResampler(int inputRate, int outputRate) + { + if (inputRate <= 0) throw new ArgumentOutOfRangeException(nameof(inputRate)); + if (outputRate <= 0) throw new ArgumentOutOfRangeException(nameof(outputRate)); + _step = (double)inputRate / outputRate; + } + + public void Reset() + { + _pos = 0.0; + _prev = 0f; + } + + /// + /// Resamples the first samples of and + /// returns the produced output samples (possibly empty for very small chunks). + /// + public float[] Process(float[] input, int count) + { + if (count <= 0) return Array.Empty(); + + var output = new List((int)(count / _step) + 2); + double pos = _pos; + while (pos < count - 1) + { + int i0 = (int)Math.Floor(pos); + float a = i0 < 0 ? _prev : input[i0]; + float b = input[i0 + 1]; + float frac = (float)(pos - i0); + output.Add(a * (1f - frac) + b * frac); + pos += _step; + } + _prev = input[count - 1]; + _pos = pos - count; + return output.ToArray(); + } + } +} diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 1a1823f0..89f3f68f 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -18,14 +18,9 @@ sealed public class MicrophoneSource : RtcAudioSource // The microphone clip's ring buffer is read directly (no AudioSource playback, no // OnAudioFilterRead), so capture is decoupled from the output device's clock. // - // Microphone.GetPosition cannot be trusted as a sample position on every platform. 
On - // macOS with a Bluetooth HFP headset, FMOD writes each real 20ms packet of clip.frequency - // audio, then advances the position counter ~3.2x too far and zero-fills the skipped - // range. The buffer then holds valid fragments of N samples at a stride J (measured: 320 - // of every 1024) and the counter rate is k = J/N times the data rate. Inspection of a raw - // buffer dump showed the fragments are consecutive speech that joins continuously, so the - // stream is reconstructed losslessly by reading only the first N = J/k samples of each - // stride. Healthy devices have k ~ 1 and use a plain contiguous read. + // Microphone.GetPosition cannot be trusted as a sample position on every platform; see + // MicClipReader for the fragmented-buffer model (macOS + Bluetooth HFP) and how the + // contiguous stream is reconstructed from it. // // The clip's data rate is clip.frequency (verified: fragments play at correct pitch), so // captured samples are resampled from clip.frequency to the fixed native-source rate. @@ -42,9 +37,7 @@ sealed public class MicrophoneSource : RtcAudioSource private bool _started = false; private volatile bool _capturing = false; - // Streaming linear-resampler state (input = clip.frequency, output = TargetSampleRate). - private double _resamplePos; - private float _resamplePrev; + private StreamingResampler _resampler; /// /// Creates a new microphone source for the given device. @@ -147,105 +140,53 @@ private IEnumerator StartMicrophone() } // Reads new samples from the clip's ring buffer each frame and pushes them to the native - // source via AudioRead. Runs on the main thread; the native source's queue absorbs the - // per-frame pacing jitter. + // source via AudioRead. MicClipReader decides what to read (including reconstructing + // fragmented buffers); this loop is the thin Unity shell around it. Runs on the main + // thread; the native source's queue absorbs the per-frame pacing jitter. 
private IEnumerator CaptureLoop(AudioClip clip) { int clipFrames = clip.samples; int channels = clip.channels; int dataRate = clip.frequency > 0 ? clip.frequency : (int)DefaultMicrophoneSampleRate; - // Pre-roll: measure how fast the position counter advances (its average is steady even - // when individual values jump) and the size of its smallest discrete jump. - int prevCounter = Microphone.GetPosition(_deviceName); - long advance = 0; - long minJump = long.MaxValue; - var preRoll = System.Diagnostics.Stopwatch.StartNew(); - while (preRoll.Elapsed.TotalSeconds < PreRollSeconds) - { - if (!_capturing || _disposed) yield break; - yield return null; - int c = Microphone.GetPosition(_deviceName); - long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; - prevCounter = c; - advance += d; - if (d > 0 && d < minJump) minJump = d; - } - if (!_capturing || _disposed) yield break; - - double counterRate = advance > 0 ? advance / preRoll.Elapsed.TotalSeconds : dataRate; - double k = counterRate / dataRate; - - // Fragmented mode: the counter advances in jumps of `stride`, but only the first - // `validPerStride` samples of each stride contain data; the rest is zero padding. - bool fragmented = k > FragmentedKThreshold && minJump != long.MaxValue && minJump > 1; - int stride = fragmented ? (int)minJump : 0; - int validPerStride = fragmented ? 
Math.Max(1, (int)Math.Round(stride / k)) : 0; - - if (fragmented) - Utils.Info($"MicrophoneSource: fragmented clip detected (k={k:F2}); reading {validPerStride} of every {stride} samples at {dataRate}Hz"); - else - Utils.Info($"MicrophoneSource: contiguous capture (k={k:F2}) at {dataRate}Hz"); - - _resamplePos = 0.0; - _resamplePrev = 0f; - long maxBacklog = (long)(counterRate * MaxBacklogSeconds); - int readPos = prevCounter; // counter values land on jump boundaries - long pending = 0; + var reader = new MicClipReader(clipFrames, dataRate, PreRollSeconds, FragmentedKThreshold, MaxBacklogSeconds); + _resampler = new StreamingResampler(dataRate, (int)TargetSampleRate); + var ranges = new List(); + var clock = System.Diagnostics.Stopwatch.StartNew(); + bool announced = false; + long reportedDrops = 0; while (_capturing && !_disposed) { yield return null; - int c = Microphone.GetPosition(_deviceName); - long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; - prevCounter = c; - pending += d; + ranges.Clear(); + reader.Update(Microphone.GetPosition(_deviceName), clock.Elapsed.TotalSeconds, ranges); - // After a long stall, drop the oldest backlog instead of pushing a burst that - // would overrun the native source's queue. 
- if (pending > maxBacklog) + if (!announced && reader.Ready) { - long drop = pending - maxBacklog; - if (fragmented) drop -= drop % stride; // preserve stride alignment - readPos = (int)((readPos + drop) % clipFrames); - pending -= drop; - Utils.Warning($"MicrophoneSource: dropped {drop} buffered samples after a stall"); + announced = true; + if (reader.Fragmented) + Utils.Info($"MicrophoneSource: fragmented clip detected (k={reader.K:F2}); reading {reader.ValidPerStride} of every {reader.Stride} samples at {dataRate}Hz"); + else + Utils.Info($"MicrophoneSource: contiguous capture (k={reader.K:F2}) at {dataRate}Hz"); } - if (fragmented) - { - while (pending >= stride) - { - EmitClipRange(clip, channels, dataRate, readPos, validPerStride, clipFrames); - readPos = (readPos + stride) % clipFrames; - pending -= stride; - } - } - else if (pending > 0) + if (reader.TotalDropped > reportedDrops) { - EmitClipRange(clip, channels, dataRate, readPos, (int)pending, clipFrames); - readPos = (int)((readPos + pending) % clipFrames); - pending = 0; + Utils.Warning($"MicrophoneSource: dropped {reader.TotalDropped - reportedDrops} buffered samples after a stall"); + reportedDrops = reader.TotalDropped; } - } - } - // Reads `count` frames starting at `start`, splitting at the ring wrap so each GetData - // read is contiguous. 
- private void EmitClipRange(AudioClip clip, int channels, int dataRate, int start, int count, int clipFrames) - { - if (count <= 0) return; - int first = Math.Min(count, clipFrames - start); - ReadAndPush(clip, channels, dataRate, start, first); - if (count > first) - ReadAndPush(clip, channels, dataRate, 0, count - first); + for (int i = 0; i < ranges.Count; i++) + ReadAndPush(clip, channels, ranges[i].Start, ranges[i].Count); + } } - // Reads a contiguous range, downmixes to mono, resamples dataRate -> TargetSampleRate - // (streaming linear interpolation carrying state across calls, so fragment junctions stay + // Reads a contiguous range, downmixes to mono, resamples clip.frequency -> + // TargetSampleRate (the resampler carries state across calls, so fragment junctions stay // continuous), and fires AudioRead. - private void ReadAndPush(AudioClip clip, int channels, int dataRate, int start, int count) + private void ReadAndPush(AudioClip clip, int channels, int start, int count) { if (count <= 0) return; @@ -269,26 +210,9 @@ private void ReadAndPush(AudioClip clip, int channels, int dataRate, int start, } } - double step = (double)dataRate / TargetSampleRate; - var output = new List((int)(count / step) + 2); - - // Index -1 maps to the carried last sample of the previous chunk so interpolation is - // continuous across chunk boundaries. pos stays >= -1. - double pos = _resamplePos; - while (pos < count - 1) - { - int i0 = (int)Math.Floor(pos); - float a = i0 < 0 ? 
_resamplePrev : mono[i0]; - float b = mono[i0 + 1]; - float frac = (float)(pos - i0); - output.Add(a * (1f - frac) + b * frac); - pos += step; - } - _resamplePrev = mono[count - 1]; - _resamplePos = pos - count; - - if (output.Count > 0) - AudioRead?.Invoke(output.ToArray(), 1, (int)TargetSampleRate); + var output = _resampler.Process(mono, count); + if (output.Length > 0) + AudioRead?.Invoke(output, 1, (int)TargetSampleRate); } /// diff --git a/Tests/EditMode/MicClipReaderTests.cs b/Tests/EditMode/MicClipReaderTests.cs new file mode 100644 index 00000000..54b3a534 --- /dev/null +++ b/Tests/EditMode/MicClipReaderTests.cs @@ -0,0 +1,243 @@ +using System; +using System.Collections.Generic; +using NUnit.Framework; +using LiveKit.Internal; + +namespace LiveKit.EditModeTests +{ + /// + /// Tests for the microphone clip reading logic, including reconstruction of the fragmented + /// buffers produced by macOS with Bluetooth HFP headsets (valid fragments of 320 samples at a + /// 1024-sample stride with zero padding, position counter inflated k=3.2x; structure taken + /// from a raw buffer dump of a Sony MDR-1000X). + /// + public class MicClipReaderTests + { + const double PreRoll = 0.3; + + static List Drain(MicClipReader reader, int counter, double t) + { + var ranges = new List(); + reader.Update(counter, t, ranges); + return ranges; + } + + // Runs the pre-roll with the given advance per tick, returning (counter, time) at the end. 
+ static (int counter, double t) RunPreRoll(MicClipReader reader, int clipFrames, int advancePerTick, double dt) + { + int counter = 0; + double t = 0; + reader.Update(counter, t, new List()); + while (!reader.Ready) + { + t += dt; + counter = (counter + advancePerTick) % clipFrames; + reader.Update(counter, t, new List()); + } + return (counter, t); + } + + [Test] + public void HealthyDevice_UsesContiguousMode_AndEmitsAllSamples() + { + const int clipFrames = 96000; // 2s @ 48k + const int rate = 48000; + const int perTick = 480; // 10ms ticks at the data rate + const double dt = 0.01; + + var reader = new MicClipReader(clipFrames, rate, PreRoll); + var (counter, t) = RunPreRoll(reader, clipFrames, perTick, dt); + + Assert.IsFalse(reader.Fragmented); + Assert.AreEqual(1.0, reader.K, 0.02); + + long emitted = 0; + for (int i = 0; i < 100; i++) + { + t += dt; + counter = (counter + perTick) % clipFrames; + foreach (var r in Drain(reader, counter, t)) + { + Assert.LessOrEqual(r.Start + r.Count, clipFrames, "range must not cross the ring wrap"); + emitted += r.Count; + } + } + Assert.AreEqual(100L * perTick, emitted, "contiguous mode must emit every written sample"); + } + + [Test] + public void FragmentedDevice_DetectsStrideAndValidCount() + { + const int clipFrames = 32000; // 2s @ 16k + const int rate = 16000; + const int stride = 1024; // one counter jump per real 20ms packet + const double dt = 0.02; + + var reader = new MicClipReader(clipFrames, rate, PreRoll); + RunPreRoll(reader, clipFrames, stride, dt); + + Assert.IsTrue(reader.Fragmented); + Assert.AreEqual(3.2, reader.K, 0.05); + Assert.AreEqual(stride, reader.Stride); + Assert.AreEqual(320, reader.ValidPerStride); + } + + [Test] + public void FragmentedDevice_ReconstructsContiguousStream() + { + const int clipFrames = 32000; + const int rate = 16000; + const int stride = 1024; + const int valid = 320; + const double dt = 0.02; + + var reader = new MicClipReader(clipFrames, rate, PreRoll); + + // Simulated 
clip: each tick the writer stores `valid` sequential marker values at the + // counter's previous position and zero-fills the rest of the stride, exactly like the + // dumped MDR-1000X buffer. + var clip = new float[clipFrames]; + float marker = 1f; + int counter = 0; + double t = 0; + reader.Update(counter, t, new List()); + + void WriteFragment() + { + for (int i = 0; i < stride; i++) + clip[(counter + i) % clipFrames] = i < valid ? marker + i : 0f; + marker += valid; + counter = (counter + stride) % clipFrames; + } + + while (!reader.Ready) + { + t += dt; + WriteFragment(); + reader.Update(counter, t, new List()); + } + + // Capture for several buffer laps and verify the emitted stream is the unbroken + // marker sequence: lossless reconstruction with no gaps, repeats, or padding. + var collected = new List(); + for (int tick = 0; tick < 200; tick++) + { + t += dt; + WriteFragment(); + foreach (var r in Drain(reader, counter, t)) + { + Assert.LessOrEqual(r.Start + r.Count, clipFrames, "range must not cross the ring wrap"); + for (int i = 0; i < r.Count; i++) + collected.Add(clip[r.Start + i]); + } + } + + Assert.AreEqual(200 * valid, collected.Count, "every valid fragment must be emitted exactly once"); + for (int i = 1; i < collected.Count; i++) + Assert.AreEqual(collected[i - 1] + 1f, collected[i], $"stream must be contiguous at index {i}"); + } + + [Test] + public void FragmentedDevice_DropsStaleBacklogStrideAligned() + { + const int clipFrames = 32000; + const int rate = 16000; + const int stride = 1024; + const double dt = 0.02; + const double maxBacklogSec = 0.2; + + var reader = new MicClipReader(clipFrames, rate, PreRoll, 1.05, maxBacklogSec); + var (counter, t) = RunPreRoll(reader, clipFrames, stride, dt); + + // One giant advance (a main-thread stall): 25 strides at once. 
+ const int stalledStrides = 25; + counter = (counter + stalledStrides * stride) % clipFrames; + t += stalledStrides * dt; + var ranges = Drain(reader, counter, t); + + Assert.Greater(reader.TotalDropped, 0, "stall backlog must be dropped"); + Assert.AreEqual(0, reader.TotalDropped % stride, "drop must preserve stride alignment"); + + // Emitted + dropped must account for the whole advance (in counter units). + long emittedStrides = 0; + foreach (var r in ranges) emittedStrides += r.Count; + emittedStrides /= reader.ValidPerStride; + Assert.AreEqual(stalledStrides, emittedStrides + reader.TotalDropped / stride); + + // The bounded burst must not exceed the backlog limit. + Assert.LessOrEqual(emittedStrides * stride, (long)(reader.CounterRate * maxBacklogSec)); + } + + [Test] + public void NoRangesAreEmittedDuringPreRoll() + { + const int clipFrames = 96000; + var reader = new MicClipReader(clipFrames, 48000, PreRoll); + var ranges = new List(); + reader.Update(0, 0.0, ranges); + reader.Update(480, 0.01, ranges); + reader.Update(960, 0.02, ranges); + Assert.IsFalse(reader.Ready); + Assert.IsEmpty(ranges); + } + } + + public class StreamingResamplerTests + { + static float[] Sine(int count, double freqHz, int rate) + { + var s = new float[count]; + for (int i = 0; i < count; i++) + s[i] = (float)Math.Sin(2.0 * Math.PI * freqHz * i / rate); + return s; + } + + static int ZeroCrossings(IReadOnlyList s) + { + int n = 0; + for (int i = 1; i < s.Count; i++) + if ((s[i - 1] < 0f) != (s[i] < 0f)) n++; + return n; + } + + [Test] + public void Upsample16kTo48k_PreservesFrequencyAndLength() + { + const int inRate = 16000, outRate = 48000; + var input = Sine(16000, 200.0, inRate); // 1s of 200Hz + var resampler = new StreamingResampler(inRate, outRate); + var output = resampler.Process(input, input.Length); + + Assert.AreEqual(outRate, output.Length, outRate / 100, "1s in should be ~1s out at the new rate"); + // 200Hz over ~1s crosses zero ~400 times regardless of sample rate. 
+ Assert.AreEqual(ZeroCrossings(input), ZeroCrossings(output), 4); + } + + [Test] + public void ChunkedProcessing_MatchesWholeProcessing() + { + const int inRate = 16000, outRate = 48000; + var input = Sine(3200, 250.0, inRate); + + var whole = new StreamingResampler(inRate, outRate).Process(input, input.Length); + + // Process the same stream in 320-sample fragments (the MDR-1000X packet size). + var chunked = new List(); + var resampler = new StreamingResampler(inRate, outRate); + for (int off = 0; off < input.Length; off += 320) + { + var chunk = new float[320]; + Array.Copy(input, off, chunk, 0, 320); + chunked.AddRange(resampler.Process(chunk, 320)); + } + + // Accumulated floating-point rounding differs by an ulp between the two paths (the + // chunked position is renormalized per chunk), which can flip the final boundary + // sample — allow a 1-sample tail difference, but the overlap must match exactly. + Assert.AreEqual(whole.Length, chunked.Count, 1, "chunking must not change the output length (±1 tail sample)"); + int overlap = Math.Min(whole.Length, chunked.Count); + for (int i = 0; i < overlap; i++) + Assert.AreEqual(whole[i], chunked[i], 1e-4f, $"chunked output diverges at {i}"); + } + } +} From d92df2d68f2a9be0c3be71b81870ef1a53c14aa6 Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:49:48 +0200 Subject: [PATCH 3/5] Fix MissingComponentException when unpublishing the microphone MicrophoneSource no longer attaches an AudioSource to its GameObject (it reads the mic clip directly), but the Meet sample still called GetComponent<AudioSource>()?.Stop() on unpublish. The ?. operator bypasses Unity's overloaded null-check on the editor's missing-component stub, so Stop() ran on the stub and threw MissingComponentException. Remove the obsolete call.
Co-Authored-By: Claude Fable 5 --- Samples~/Meet/Assets/Runtime/MeetManager.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Samples~/Meet/Assets/Runtime/MeetManager.cs b/Samples~/Meet/Assets/Runtime/MeetManager.cs index 97b2cb70..baa70837 100644 --- a/Samples~/Meet/Assets/Runtime/MeetManager.cs +++ b/Samples~/Meet/Assets/Runtime/MeetManager.cs @@ -487,7 +487,7 @@ private void UnpublishLocalMicrophone() if (_audioObjects.TryGetValue(LocalAudioTrackName, out var obj)) { - obj.GetComponent()?.Stop(); + // MicrophoneSource reads the mic clip directly; no AudioSource is attached anymore. Destroy(obj); _audioObjects.Remove(LocalAudioTrackName); } From 9b63c7556b35f2539ce3001602036a12020a40ad Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:19:55 +0200 Subject: [PATCH 4/5] Prevent false fragmented-mode detection; fix remaining AudioSource cleanup Field testing device transitions surfaced a false positive: right after recovering onto the healthy MacBook microphone, the pre-roll measured k=1.07 (counter startup burst while driver buffers flush) which crossed the old 1.05 threshold and engaged fragmented mode - silently discarding ~6% of real audio (heard as choppiness) until the next re-measurement. Engaging fragmented mode discards (stride - valid) samples per stride, so a false positive guarantees audio loss while a false negative only risks mild artifacts. Fix both sides of the measurement: - Raise the fragmented threshold from 1.05 to 1.5: the observed pathological device measures k=3.2, healthy devices ~1.0 plus a few percent of noise - keep a wide margin between the two. - Add a 100ms settle window that discards the counter's startup burst before the rate measurement begins. Add a regression test for the borderline case (k=1.07 must stay contiguous). 
Also fix the second AudioSource null-propagation site (CleanUpAllTracks via OnDestroy) with TryGetComponent - same MissingComponentException class as the unpublish path, hit because the local mic object no longer carries an AudioSource. Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/Internal/MicClipReader.cs | 25 +++++++++++++++++---- Runtime/Scripts/MicrophoneSource.cs | 9 ++++++-- Samples~/Meet/Assets/Runtime/MeetManager.cs | 5 ++++- Tests/EditMode/MicClipReaderTests.cs | 18 +++++++++++++++ 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/Runtime/Scripts/Internal/MicClipReader.cs b/Runtime/Scripts/Internal/MicClipReader.cs index cb04b9fa..99a49bbb 100644 --- a/Runtime/Scripts/Internal/MicClipReader.cs +++ b/Runtime/Scripts/Internal/MicClipReader.cs @@ -30,10 +30,12 @@ public struct ReadRange private readonly double _preRollSeconds; private readonly double _fragmentedKThreshold; private readonly double _maxBacklogSeconds; + private readonly double _settleSeconds; private bool _hasFirstSample; private int _prevCounter; - private double _preRollStart; + private double _firstSampleTime; + private double _measureStart = double.NaN; private long _preRollAdvance; private long _minJump = long.MaxValue; @@ -63,7 +65,8 @@ public struct ReadRange public long TotalDropped { get; private set; } public MicClipReader(int clipFrames, int dataRate, - double preRollSeconds = 0.3, double fragmentedKThreshold = 1.05, double maxBacklogSeconds = 0.2) + double preRollSeconds = 0.3, double fragmentedKThreshold = 1.5, double maxBacklogSeconds = 0.2, + double settleSeconds = 0.1) { if (clipFrames <= 0) throw new ArgumentOutOfRangeException(nameof(clipFrames)); if (dataRate <= 0) throw new ArgumentOutOfRangeException(nameof(dataRate)); @@ -72,6 +75,7 @@ public MicClipReader(int clipFrames, int dataRate, _preRollSeconds = preRollSeconds; _fragmentedKThreshold = fragmentedKThreshold; _maxBacklogSeconds = maxBacklogSeconds; + _settleSeconds = settleSeconds; } /// @@ -84,7 
+88,7 @@ public void Update(int counterPosition, double elapsedSeconds, List r { _hasFirstSample = true; _prevCounter = counterPosition; - _preRollStart = elapsedSeconds; + _firstSampleTime = elapsedSeconds; return; } @@ -93,9 +97,22 @@ public void Update(int counterPosition, double elapsedSeconds, List r if (!Ready) { + // Discard the settle window entirely: right after a device starts, the counter can + // burst ahead while driver buffers flush, which would inflate the measured rate + // (observed: a healthy device measuring k=1.07 right after a device transition). + if (elapsedSeconds - _firstSampleTime < _settleSeconds) + return; + if (double.IsNaN(_measureStart)) + { + // Anchor the measurement window here; the delta spanning the settle boundary + // is discarded with the settle period. + _measureStart = elapsedSeconds; + return; + } + _preRollAdvance += d; if (d > 0 && d < _minJump) _minJump = d; - double window = elapsedSeconds - _preRollStart; + double window = elapsedSeconds - _measureStart; if (window >= _preRollSeconds) FinishPreRoll(window); return; diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 89f3f68f..b424e2a5 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -26,7 +26,12 @@ sealed public class MicrophoneSource : RtcAudioSource // captured samples are resampled from clip.frequency to the fixed native-source rate. private const uint TargetSampleRate = 48000; private const float PreRollSeconds = 0.3f; - private const double FragmentedKThreshold = 1.05; + private const float SettleSeconds = 0.1f; // discard the counter's startup burst before measuring + // Engaging fragmented mode discards (stride - valid) samples per stride, so a false + // positive guarantees audio loss while a false negative only risks mild artifacts. The + // observed pathological device measures k=3.2; healthy devices measure ~1.0 with up to a + // few percent of startup noise. 
Keep a wide margin between the two. + private const double FragmentedKThreshold = 1.5; private const float MaxBacklogSeconds = 0.2f; // drop backlog beyond this after a stall private readonly string _deviceName; @@ -149,7 +154,7 @@ private IEnumerator CaptureLoop(AudioClip clip) int channels = clip.channels; int dataRate = clip.frequency > 0 ? clip.frequency : (int)DefaultMicrophoneSampleRate; - var reader = new MicClipReader(clipFrames, dataRate, PreRollSeconds, FragmentedKThreshold, MaxBacklogSeconds); + var reader = new MicClipReader(clipFrames, dataRate, PreRollSeconds, FragmentedKThreshold, MaxBacklogSeconds, SettleSeconds); _resampler = new StreamingResampler(dataRate, (int)TargetSampleRate); var ranges = new List(); var clock = System.Diagnostics.Stopwatch.StartNew(); diff --git a/Samples~/Meet/Assets/Runtime/MeetManager.cs b/Samples~/Meet/Assets/Runtime/MeetManager.cs index baa70837..532aa319 100644 --- a/Samples~/Meet/Assets/Runtime/MeetManager.cs +++ b/Samples~/Meet/Assets/Runtime/MeetManager.cs @@ -566,7 +566,10 @@ private void CleanUpAllTracks() foreach (var obj in _audioObjects.Values) { if (obj == null) continue; - obj.GetComponent()?.Stop(); + // Not every audio object has an AudioSource (the local mic object no longer does), and + // ?. on GetComponent bypasses Unity's missing-component null semantics in the editor. 
+ if (obj.TryGetComponent(out var audioSource)) + audioSource.Stop(); Destroy(obj); } _audioObjects.Clear(); diff --git a/Tests/EditMode/MicClipReaderTests.cs b/Tests/EditMode/MicClipReaderTests.cs index 54b3a534..36a563d5 100644 --- a/Tests/EditMode/MicClipReaderTests.cs +++ b/Tests/EditMode/MicClipReaderTests.cs @@ -168,6 +168,24 @@ public void FragmentedDevice_DropsStaleBacklogStrideAligned() Assert.LessOrEqual(emittedStrides * stride, (long)(reader.CounterRate * maxBacklogSec)); } + [Test] + public void SlightlyInflatedCounter_StaysContiguous() + { + // Regression: a healthy MacBook mic measured k=1.07 right after a device transition + // (startup-burst noise), and the old 1.05 threshold engaged fragmented mode, silently + // discarding ~6% of real audio. Borderline rates must stay contiguous. + const int clipFrames = 96000; + const int rate = 48000; + const int perTick = 514; // ~k=1.07 at 10ms ticks + const double dt = 0.01; + + var reader = new MicClipReader(clipFrames, rate, PreRoll); + RunPreRoll(reader, clipFrames, perTick, dt); + + Assert.IsFalse(reader.Fragmented, "k slightly above 1 must not trigger fragmented mode"); + Assert.AreEqual(1.07, reader.K, 0.02); + } + [Test] public void NoRangesAreEmittedDuringPreRoll() { From 457ce3fbe4f2a5f6c284241100bd8701ea20ddb9 Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:22:03 +0200 Subject: [PATCH 5/5] Add Unity meta files for the new scripts Generated by the editor; required for stable GUIDs when the package is imported. 
Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/Internal/MicClipReader.cs.meta | 11 +++++++++++ Runtime/Scripts/Internal/StreamingResampler.cs.meta | 11 +++++++++++ Tests/EditMode/MicClipReaderTests.cs.meta | 11 +++++++++++ 3 files changed, 33 insertions(+) create mode 100644 Runtime/Scripts/Internal/MicClipReader.cs.meta create mode 100644 Runtime/Scripts/Internal/StreamingResampler.cs.meta create mode 100644 Tests/EditMode/MicClipReaderTests.cs.meta diff --git a/Runtime/Scripts/Internal/MicClipReader.cs.meta b/Runtime/Scripts/Internal/MicClipReader.cs.meta new file mode 100644 index 00000000..88aa56bd --- /dev/null +++ b/Runtime/Scripts/Internal/MicClipReader.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: d0ae29390ef914aa6b62ae81c9b4f212 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Runtime/Scripts/Internal/StreamingResampler.cs.meta b/Runtime/Scripts/Internal/StreamingResampler.cs.meta new file mode 100644 index 00000000..26d7c37c --- /dev/null +++ b/Runtime/Scripts/Internal/StreamingResampler.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 967338b84cfb74bdebca9132f3b9abd0 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Tests/EditMode/MicClipReaderTests.cs.meta b/Tests/EditMode/MicClipReaderTests.cs.meta new file mode 100644 index 00000000..70af710c --- /dev/null +++ b/Tests/EditMode/MicClipReaderTests.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: b8df68a85510e4aa58359a4dd8b170c6 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: