From 211000084e0179d7e087c31a7214d72ac8b65bb0 Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:45:54 +0200 Subject: [PATCH 1/7] Configure native audio source from device, not hardcoded defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The native (Rust) audio source was created with a hardcoded sample rate (48000) and channel count (2). Microphone frames flow through Unity's audio graph (AudioProbe) at the actual DSP output configuration, which often differs — e.g. with a Bluetooth headset. The Rust source does not resample; it rejects frames whose rate/channels don't match, causing the metadata-mismatch warning and capture failures. Read the source's sample rate and channel count from Unity's output configuration (AudioSettings.GetConfiguration) instead of hardcoded defaults, falling back to the defaults only when Unity can't report one. The base constructor now exposes a device-mode overload (type only) and an explicit overload (type, sampleRate, channels) for sources that generate a fixed format. MicrophoneSource and BasicAudioSource use device mode; BasicAudioSource drops its unused channels parameter. SineWaveAudioSource declares its exact format. If a frame's format still doesn't match (inconsistent Unity report or a runtime output change), drop it with a throttled warning instead of sending a mismatch the native side would error on. Also removes the redundant Microphone.Start in the Meet sample. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Runtime/Scripts/BasicAudioSource.cs | 6 +- Runtime/Scripts/MicrophoneSource.cs | 2 +- Runtime/Scripts/RtcAudioSource.cs | 81 ++++++++++++++++++--- Samples~/Meet/Assets/Runtime/MeetManager.cs | 3 +- Tests/PlayMode/Utils/SineWaveAudioSource.cs | 2 +- 5 files changed, 79 insertions(+), 15 deletions(-) diff --git a/Runtime/Scripts/BasicAudioSource.cs b/Runtime/Scripts/BasicAudioSource.cs index 3b63680b..8193090d 100644 --- a/Runtime/Scripts/BasicAudioSource.cs +++ b/Runtime/Scripts/BasicAudioSource.cs @@ -19,9 +19,11 @@ sealed public class BasicAudioSource : RtcAudioSource /// Creates a new basic audio source for the given <see cref="AudioSource"/> in the scene. /// </summary> /// <param name="source">The <see cref="AudioSource"/> to capture from.</param> - /// <param name="channels">The number of channels to capture.</param> /// <param name="sourceType">The type of audio source.</param> - public BasicAudioSource(AudioSource source, int channels = 2, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(channels, sourceType) + /// <remarks> + /// The sample rate and channel count are taken from Unity's audio configuration. + /// </remarks> + public BasicAudioSource(AudioSource source, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(sourceType) { _source = source; } diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 904b8da7..a8775568 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -28,7 +28,7 @@ sealed public class MicrophoneSource : RtcAudioSource /// get the list of available devices. /// The GameObject to attach the AudioSource to. The object must be kept in the scene /// for the duration of the source's lifetime. 
- public MicrophoneSource(string deviceName, GameObject sourceObject) : base(2, RtcAudioSourceType.AudioSourceMicrophone) + public MicrophoneSource(string deviceName, GameObject sourceObject) : base(RtcAudioSourceType.AudioSourceMicrophone) { _deviceName = deviceName; _sourceObject = sourceObject; diff --git a/Runtime/Scripts/RtcAudioSource.cs b/Runtime/Scripts/RtcAudioSource.cs index a9af8a0a..7f5c3d7c 100644 --- a/Runtime/Scripts/RtcAudioSource.cs +++ b/Runtime/Scripts/RtcAudioSource.cs @@ -83,20 +83,33 @@ private sealed class PendingAudioFrame private volatile bool _disposed = false; private int _audioReadCount = 0; - protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = RtcAudioSourceType.AudioSourceCustom) + // Device-capture sources (microphone, AudioSource taps) don't know their format ahead of + // time — it is whatever Unity's audio graph delivers. They use this constructor, which + // configures the native source from Unity's current output configuration. + protected RtcAudioSource(RtcAudioSourceType audioSourceType) + : this(audioSourceType, 0, 0) { } + + // Sources that generate a fixed, known format (e.g. test signal generators) declare it + // directly. Passing 0 for either value falls back to the device configuration. + protected RtcAudioSource(RtcAudioSourceType audioSourceType, uint sampleRate, uint channels) { _sourceType = audioSourceType; - _expectedChannels = (uint)channels; + + if (sampleRate > 0 && channels > 0) + { + _expectedSampleRate = sampleRate; + _expectedChannels = channels; + } + else + { + (_expectedSampleRate, _expectedChannels) = ResolveDeviceFormat(); + } using var request = FFIBridge.Instance.NewRequest(); var newAudioSource = request.request; newAudioSource.Type = AudioSourceType.AudioSourceNative; - newAudioSource.NumChannels = (uint)channels; - newAudioSource.SampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone ? 
- DefaultMicrophoneSampleRate : DefaultSampleRate; - _expectedSampleRate = newAudioSource.SampleRate; - - Utils.Debug($"NewAudioSource: {newAudioSource.NumChannels} {newAudioSource.SampleRate}"); + newAudioSource.NumChannels = _expectedChannels; + newAudioSource.SampleRate = _expectedSampleRate; newAudioSource.Options = request.TempResource(); newAudioSource.Options.EchoCancellation = true; @@ -109,6 +122,49 @@ protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = Utils.Debug($"{DebugTag} created handle={Handle.DangerousGetHandle()} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}"); } + // Reads Unity's actual output audio configuration. The capture path delivers buffers at the + // DSP output rate/channel count (see AudioProbe), so this is the format the native source + // must match. Falls back to the platform defaults when Unity cannot report a configuration + // (e.g. batch mode without an audio device). + private (uint sampleRate, uint channels) ResolveDeviceFormat() + { + uint sampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone + ? 
DefaultMicrophoneSampleRate + : DefaultSampleRate; + uint channels = DefaultChannels; + + try + { + var config = UnityEngine.AudioSettings.GetConfiguration(); + if (config.sampleRate > 0) + sampleRate = (uint)config.sampleRate; + var configuredChannels = SpeakerModeChannels(config.speakerMode); + if (configuredChannels > 0) + channels = configuredChannels; + } + catch (Exception e) + { + Utils.Warning($"{DebugTag} could not read Unity audio configuration, using defaults: {e.Message}"); + } + + return (sampleRate, channels); + } + + private static uint SpeakerModeChannels(UnityEngine.AudioSpeakerMode mode) + { + switch (mode) + { + case UnityEngine.AudioSpeakerMode.Mono: return 1; + case UnityEngine.AudioSpeakerMode.Stereo: return 2; + case UnityEngine.AudioSpeakerMode.Quad: return 4; + case UnityEngine.AudioSpeakerMode.Surround: return 5; + case UnityEngine.AudioSpeakerMode.Mode5point1: return 6; + case UnityEngine.AudioSpeakerMode.Mode7point1: return 8; + case UnityEngine.AudioSpeakerMode.Prologic: return 2; + default: return 0; + } + } + /// /// Begin capturing audio samples from the underlying source. /// @@ -153,9 +209,16 @@ private void OnAudioRead(float[] data, int channels, int sampleRate) return; } + // The native source rejects frames whose rate/channels differ from how it was + // configured (it does not resample). This should not happen now that the source is + // configured from the device, but if Unity reports an inconsistent format — or the + // output configuration changes at runtime — we drop the frame instead of sending a + // mismatch the native side would error on. 
if ((uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels) { - Utils.Warning($"{DebugTag} audio frame #{frameIndex} metadata mismatch actualRate={sampleRate} actualChannels={channels} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}"); + if (frameIndex == 1 || frameIndex % 100 == 0) + Utils.Warning($"{DebugTag} dropping audio frame #{frameIndex}: format {sampleRate}/{channels} does not match source {_expectedSampleRate}/{_expectedChannels} (sourceType={_sourceType})"); + return; } var pendingBeforeSend = PendingFrameCount(); diff --git a/Samples~/Meet/Assets/Runtime/MeetManager.cs b/Samples~/Meet/Assets/Runtime/MeetManager.cs index 225c7a0c..97b2cb70 100644 --- a/Samples~/Meet/Assets/Runtime/MeetManager.cs +++ b/Samples~/Meet/Assets/Runtime/MeetManager.cs @@ -453,8 +453,7 @@ private IEnumerator PublishLocalMicrophone() { if (_audioObjects.ContainsKey(LocalAudioTrackName)) yield break; - Microphone.Start(null, true, 10, 44100); - + // MicrophoneSource starts the device itself, so we only need the device name here. 
var audioObject = new GameObject($"My Microphone: {Microphone.devices[0]}"); audioObject.transform.SetParent(_audioTrackParent); diff --git a/Tests/PlayMode/Utils/SineWaveAudioSource.cs b/Tests/PlayMode/Utils/SineWaveAudioSource.cs index 907e9ccc..2337615b 100644 --- a/Tests/PlayMode/Utils/SineWaveAudioSource.cs +++ b/Tests/PlayMode/Utils/SineWaveAudioSource.cs @@ -31,7 +31,7 @@ public SineWaveAudioSource( int sampleRate = 48000, double frequencyHz = 440.0, float amplitude = 0.1f) - : base(channels, RtcAudioSourceType.AudioSourceCustom) + : base(RtcAudioSourceType.AudioSourceCustom, (uint)sampleRate, (uint)channels) { _channels = channels; _sampleRate = sampleRate; From 834f2047ccf7b8a0a848b8b0518931462256ad4c Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 10:12:35 +0200 Subject: [PATCH 2/7] Add throttled capture/receive rate diagnostics (Info level) Temporary, ~2s-throttled diagnostics to investigate choppy received audio: - RtcAudioSource logs the effective capture sample rate (samples/sec by wall clock) vs the rate declared to the native source. A measured rate that differs from the declared rate means the frame format label is wrong, which would sound fast/slow/choppy on the receiver. - AudioStream logs buffer fill, underrun count, callback count and frames received, to distinguish receive-side starvation from a clean stream. Emitted via Utils.Info so they appear without LK_DEBUG (Utils.Debug is compiled out unless LK_DEBUG is defined). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Runtime/Scripts/AudioStream.cs | 31 ++++++++++++++++++++++++++++ Runtime/Scripts/RtcAudioSource.cs | 34 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/Runtime/Scripts/AudioStream.cs b/Runtime/Scripts/AudioStream.cs index 4d9ab588..3288f839 100644 --- a/Runtime/Scripts/AudioStream.cs +++ b/Runtime/Scripts/AudioStream.cs @@ -50,6 +50,14 @@ public sealed class AudioStream : IDisposable private const int CrossfadeFrames = 128; // ~2.7ms @ 48kHz private int _skipCooldown = 0; + // --- Temporary receive diagnostics (Info level, emitted ~every 2s) --- + // Reveals whether choppiness is a buffer-starvation problem (underruns/low fill) versus a + // clean stream, and what rate/channels we are actually playing/requesting. + private long _diagWindowStartTicks; + private int _diagCallbacks; + private int _diagUnderruns; + private int _diagFramesReceived; + /// /// Creates a new audio stream from a remote audio track, attaching it to the /// given in the scene. @@ -147,6 +155,8 @@ private void OnAudioRead(float[] data, int channels, int sampleRate) lock (_lock) { + MaybeLogReceiveDiagnostics(channels, sampleRate); + // Single gate covering first-create and runtime format changes (e.g. after a // system audio device switch). When the FFI stream is missing or what we asked // Rust for no longer matches what Unity is delivering, post a (re)create to the @@ -214,6 +224,7 @@ static float S16ToFloat(short v) if (valuesAvailableToRead < data.Length) { _isPrimed = false; + _diagUnderruns++; Utils.Debug($"AudioStream underrun detected, re-priming (got {valuesAvailableToRead} samples but want to read {data.Length})"); // Output silence immediately instead of playing partial/choppy samples. 
@@ -370,6 +381,7 @@ private void OnAudioStreamEvent(AudioStreamEvent e) var data = new ReadOnlySpan(frame.Data.ToPointer(), frame.Length); _buffer.Write(data); } + _diagFramesReceived++; } } @@ -427,6 +439,25 @@ private void Dispose(bool disposing) Dispose(false); } + // Temporary diagnostic: ~every 2s logs buffer fill, underrun count, callback count and + // frames received so we can tell starvation (choppy) from a clean stream. Called under _lock. + private void MaybeLogReceiveDiagnostics(int channels, int sampleRate) + { + _diagCallbacks++; + var now = System.Diagnostics.Stopwatch.GetTimestamp(); + if (_diagWindowStartTicks == 0) _diagWindowStartTicks = now; + var elapsed = (now - _diagWindowStartTicks) / (double)System.Diagnostics.Stopwatch.Frequency; + if (elapsed < 2.0) return; + + float fill = _buffer != null ? _buffer.AvailableReadInPercent() : 0f; + Utils.Info($"AudioStream#{_trackHandleId} diag: out={sampleRate}Hz/{channels}ch ffi={_ffiSampleRate}Hz/{_ffiNumChannels}ch " + + $"bufferFill={fill * 100f:F0}% callbacks={_diagCallbacks} underruns={_diagUnderruns} framesRecv={_diagFramesReceived} over={elapsed:F1}s"); + _diagWindowStartTicks = now; + _diagCallbacks = 0; + _diagUnderruns = 0; + _diagFramesReceived = 0; + } + // For testing and debugging internal float GetBufferFill() { diff --git a/Runtime/Scripts/RtcAudioSource.cs b/Runtime/Scripts/RtcAudioSource.cs index 7f5c3d7c..c85e5a2d 100644 --- a/Runtime/Scripts/RtcAudioSource.cs +++ b/Runtime/Scripts/RtcAudioSource.cs @@ -83,6 +83,15 @@ private sealed class PendingAudioFrame private volatile bool _disposed = false; private int _audioReadCount = 0; + // --- Temporary capture-rate diagnostics (Info level, emitted ~every 2s) --- + // Measures the effective sample rate from wall-clock time vs the rate we declared to the + // native source. A measured rate that differs from the declared rate means the format + // label on the frames is wrong (audio would sound fast/slow/choppy on the receiver). 
+ private long _diagWindowStartTicks; // 0 = not started + private long _diagSamplesPerChannel; + private int _diagAcceptedFrames; + private int _diagDroppedFrames; + // Device-capture sources (microphone, AudioSource taps) don't know their format ahead of // time — it is whatever Unity's audio graph delivers. They use this constructor, which // configures the native source from Unity's current output configuration. @@ -209,6 +218,9 @@ private void OnAudioRead(float[] data, int channels, int sampleRate) return; } + var willDrop = (uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels; + RecordCaptureDiagnostics(data.Length / channels, channels, sampleRate, willDrop); + // The native source rejects frames whose rate/channels differ from how it was // configured (it does not resample). This should not happen now that the source is // configured from the device, but if Unity reports an inconsistent format — or the @@ -405,6 +417,28 @@ private static double ElapsedMilliseconds(long startedTimestamp) return (Stopwatch.GetTimestamp() - startedTimestamp) * 1000.0 / Stopwatch.Frequency; } + // Temporary diagnostic: accumulates captured audio over wall-clock time and, ~every 2s, + // logs the effective sample rate vs the rate declared to the native source. Runs on the + // audio thread; the periodic Info log is cheap. 
+ private void RecordCaptureDiagnostics(int samplesPerChannel, int channels, int sampleRate, bool dropped) + { + var now = Stopwatch.GetTimestamp(); + if (_diagWindowStartTicks == 0) _diagWindowStartTicks = now; + _diagSamplesPerChannel += samplesPerChannel; + if (dropped) _diagDroppedFrames++; else _diagAcceptedFrames++; + + var elapsed = (now - _diagWindowStartTicks) / (double)Stopwatch.Frequency; + if (elapsed < 2.0) return; + + var measuredRate = _diagSamplesPerChannel / elapsed; + Utils.Info($"{DebugTag} capture diag: declared={_expectedSampleRate}Hz/{_expectedChannels}ch measuredRate={measuredRate:F0}Hz " + + $"lastFrame={samplesPerChannel}smp/{channels}ch/{sampleRate}Hz accepted={_diagAcceptedFrames} dropped={_diagDroppedFrames} over={elapsed:F1}s"); + _diagWindowStartTicks = now; + _diagSamplesPerChannel = 0; + _diagAcceptedFrames = 0; + _diagDroppedFrames = 0; + } + private string DebugTag => $"RtcAudioSource#{_debugId}"; } } From a775e593e899b0f9d6bee902ab7876138160f4bf Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:07:23 +0200 Subject: [PATCH 3/7] Open microphone at the output sample rate to avoid capture drift MicrophoneSource started the device at the hardcoded DefaultMicrophoneSampleRate and played the looping clip through an AudioSource read on the DSP thread. When the device's actual rate differs from the engine output rate, the clip fills and plays back at different rates, so the read position drifts against the write position and the captured audio becomes choppy. Open the microphone at AudioSettings.outputSampleRate when the device supports it (clamped to the device's reported caps; falling back to the default when the output rate is unknown), so capture and playback run at the same rate. This also aligns the mic rate with the native source rate, which is taken from the same output configuration. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Runtime/Scripts/MicrophoneSource.cs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index a8775568..9a4405ac 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -59,6 +59,28 @@ public override void Start() _started = true; } + // Opens the microphone at the engine's output sample rate when the device supports it, so + // the captured clip and the AudioSource that plays it back run at the same rate. A mismatch + // makes the looping clip drift against the playback read position and produces choppy audio. + // Falls back to DefaultMicrophoneSampleRate when the output rate is unknown, and clamps to + // the device's supported range when it reports one. + private static int ResolveMicrophoneSampleRate(string deviceName) + { + int target = AudioSettings.outputSampleRate; + if (target <= 0) + target = (int)DefaultMicrophoneSampleRate; + + Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq); + // Unity reports (0, 0) when the device imposes no specific sample-rate range. 
+ if (minFreq == 0 && maxFreq == 0) + return target; + + var result = Mathf.Clamp(target, minFreq, maxFreq); + Utils.Info($"ResolveMicrophoneSampleRate: {result}"); + + return result; + } + private IEnumerator StartMicrophone() { // Validate that the GameObject is still valid before starting @@ -76,13 +98,14 @@ private IEnumerator StartMicrophone() } AudioClip clip = null; + var micFrequency = ResolveMicrophoneSampleRate(_deviceName); try { clip = Microphone.Start( _deviceName, loop: true, lengthSec: 1, - frequency: (int)DefaultMicrophoneSampleRate + frequency: micFrequency ); } catch (Exception e) @@ -97,6 +120,8 @@ private IEnumerator StartMicrophone() yield break; } + Utils.Info($"MicrophoneSource device='{_deviceName}' opened at {micFrequency}Hz (output={AudioSettings.outputSampleRate}Hz)"); + // Ensure no duplicate components exist before adding new ones. // This is important during app resume on iOS where components might not be // fully destroyed yet due to Unity's deferred Destroy(). From 4604f248b6f61c0e5a47008128fe5186510eda01 Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:29:47 +0200 Subject: [PATCH 4/7] Add adaptive pitch servo to lock mic playback to the capture rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mic clip is filled by the capture device's clock while the AudioSource that plays it (feeding OnAudioFilterRead) runs on the output device's clock. Some devices also misreport the clip rate entirely: a Bluetooth headset on macOS labels its clip 16kHz while filling it at ~51kHz. Either way the read head drifts against the write head and gets lapped, which sounds like periodic chopping. 
Add a pacing servo that measures how fast the write head actually advances (GetPosition delta over wall clock - steady within ±0.1% even when the instantaneous position is jumpy) and continuously adjusts AudioSource.pitch so the read head consumes clip samples at the same rate, holding a fixed lag behind the writer. A short pre-roll measures the rate before playback starts so the initial pitch is already correct; the fill-rate estimate and the lag target (sized to ~4x observed jitter, bounded by clip capacity) keep adapting while capturing, and an out-of-bounds resync recovers from long hitches. In the normal case the measured rate matches clip.frequency, pitch hovers at ~1.0, and the servo is effectively a no-op. In the misreporting case pitch settles at the true ratio (~3.2), which plays the clip's real-time data at correct speed and eliminates the chop. Pitch is rate control, not a delay: the added latency is only the held lag (~80-150ms, adaptive). Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/MicrophoneSource.cs | 131 +++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 9a4405ac..a8cd1c77 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -13,6 +13,23 @@ namespace LiveKit /// sealed public class MicrophoneSource : RtcAudioSource { + // --- Playback pacing servo --- + // The mic clip is filled by the capture device's clock while the AudioSource that plays it + // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. Worse, + // some devices misreport the clip rate entirely (a Bluetooth headset on macOS labeled its + // clip 16kHz while filling it at ~51kHz). Either way the read head drifts against the + // write head until it gets lapped, which sounds like periodic chopping. 
The servo measures + // how fast the write head actually advances and continuously adjusts AudioSource.pitch so + // the read head consumes clip samples at the same rate, holding a fixed lag behind the + // writer. In the normal case the measured rate matches clip.frequency and pitch stays ~1. + private const float PreRollSeconds = 0.3f; // initial fill-rate measurement window + private const float MinTargetLagSec = 0.08f; // smallest safety lag (good devices) + private const float MaxTargetLagSec = 0.25f; // adaptive ceiling (jittery devices) + private const float PitchCorrectionGain = 0.5f; // proportional gain on relative lag error + private const float MaxRelativeCorrection = 0.2f; + private const float MinPitch = 0.25f; + private const float MaxPitch = 8f; + private readonly GameObject _sourceObject; private readonly string _deviceName; @@ -160,10 +177,122 @@ private IEnumerator StartMicrophone() yield break; } - source.Play(); + // Playback is started by the pacing servo, which first measures the clip's true fill + // rate so the initial pitch and read position are right from the first sample. + MonoBehaviourContext.RunCoroutine(PaceMicrophonePlayback(source, clip)); Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully"); } + // Keeps the AudioSource's read head locked a fixed lag behind the mic's write head by + // adjusting pitch (see the servo comment at the top of the class). Pitch is rate control, + // not a delay: the only latency this adds is the target lag itself. + private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip) + { + int clipFrames = clip.samples; + int declaredRate = clip.frequency; + + // Pre-roll: measure the true fill rate before playback starts. GetPosition's + // instantaneous position can be jumpy on misbehaving devices, but its average advance + // has measured steady (±0.1%), so a short window gives a reliable rate. 
+ int prevWrite = Microphone.GetPosition(_deviceName); + long writeAdvance = 0; + var preRoll = System.Diagnostics.Stopwatch.StartNew(); + while (preRoll.Elapsed.TotalSeconds < PreRollSeconds) + { + if (!_started || _disposed || source == null || !Microphone.IsRecording(_deviceName)) yield break; + yield return null; + int w = Microphone.GetPosition(_deviceName); + writeAdvance += ((w - prevWrite) % clipFrames + clipFrames) % clipFrames; + prevWrite = w; + } + if (!_started || _disposed || source == null) yield break; + + double fillRate = writeAdvance > 0 ? writeAdvance / preRoll.Elapsed.TotalSeconds : declaredRate; + double basePitch = fillRate / declaredRate; + // The lag target must stay well below the clip's real capacity (clipFrames at the + // true fill rate), which can be much shorter than lengthSec when the rate is misreported. + float capacityCapSec = (float)(0.4 * clipFrames / fillRate); + float targetLagSec = Mathf.Min(MinTargetLagSec, capacityCapSec); + + source.pitch = Mathf.Clamp((float)basePitch, MinPitch, MaxPitch); + source.Play(); + long targetLag = (long)(targetLagSec * fillRate); + int startRead = (int)(((prevWrite - targetLag) % clipFrames + clipFrames) % clipFrames); + source.timeSamples = startRead; + + Utils.Info($"MicrophoneSource pacing: measured={fillRate:F0}Hz declared={declaredRate}Hz pitch={source.pitch:F2} lag={targetLagSec * 1000:F0}ms"); + + int prevRead = startRead; + double lag = targetLag; // current read-behind-write distance, in clip samples + double smoothedLag = lag; + double jitter = 0; + long rateAdvance = 0; + var rateWindow = System.Diagnostics.Stopwatch.StartNew(); + var statusWindow = System.Diagnostics.Stopwatch.StartNew(); + + while (_started && !_disposed && source != null && Microphone.IsRecording(_deviceName)) + { + yield return null; + if (source == null) yield break; + + int w = Microphone.GetPosition(_deviceName); + int r = source.timeSamples; + // Unwrapped per-frame advances. 
A hitch longer than the clip aliases these; the + // resync guard below recovers from the resulting inconsistency. + long dw = ((w - prevWrite) % clipFrames + clipFrames) % clipFrames; + long dr = ((r - prevRead) % clipFrames + clipFrames) % clipFrames; + prevWrite = w; + prevRead = r; + lag += dw - dr; + rateAdvance += dw; + + smoothedLag = 0.95 * smoothedLag + 0.05 * lag; + jitter = 0.95 * jitter + 0.05 * Math.Abs(lag - smoothedLag); + + // Refine the fill rate and adapt the lag target once per second. + if (rateWindow.Elapsed.TotalSeconds >= 1.0) + { + double instRate = rateAdvance / rateWindow.Elapsed.TotalSeconds; + if (instRate > 0) + { + fillRate = 0.7 * fillRate + 0.3 * instRate; + basePitch = fillRate / declaredRate; + } + rateAdvance = 0; + rateWindow.Restart(); + + // Hold ~4x the observed jitter as safety margin, within bounds and capacity. + float jitterSec = (float)(jitter / fillRate); + capacityCapSec = (float)(0.4 * clipFrames / fillRate); + targetLagSec = Mathf.Min(Mathf.Clamp(jitterSec * 4f, MinTargetLagSec, MaxTargetLagSec), capacityCapSec); + } + + // Proportional pitch correction toward the target lag. + double target = targetLagSec * fillRate; + double relErr = (smoothedLag - target) / target; + relErr = Math.Max(-MaxRelativeCorrection, Math.Min(MaxRelativeCorrection, relErr)); + source.pitch = Mathf.Clamp((float)(basePitch * (1.0 + PitchCorrectionGain * relErr)), MinPitch, MaxPitch); + + // Out of bounds (reader overran the writer, or fell so far behind it reads + // overwritten data): jump back to the target lag. Audible once, then stable. 
+ if (lag < 0 || lag > clipFrames * 0.9) + { + int resyncRead = (int)(((w - (long)target) % clipFrames + clipFrames) % clipFrames); + source.timeSamples = resyncRead; + prevRead = resyncRead; + lag = target; + smoothedLag = target; + Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (rate={fillRate:F0}Hz pitch={source.pitch:F2})"); + } + + if (statusWindow.Elapsed.TotalSeconds >= 5.0) + { + Utils.Info($"MicrophoneSource pacing: rate={fillRate:F0}Hz pitch={source.pitch:F2} lag={smoothedLag / fillRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / fillRate * 1000:F1}ms"); + statusWindow.Restart(); + } + } + } + /// /// Stops capturing audio from the microphone. /// From 99e35ea3a1ed6402951aea1928337d0d23df2ff8 Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:39:22 +0200 Subject: [PATCH 5/7] Pin pitch to ~1; use GetPosition only rescaled to estimate the write head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Field test falsified the previous model: with pitch set to the measured counter ratio (3.2), the published audio became garbled repeats ("noise with echo"), while the servo's own lag telemetry stayed perfectly stable — because it was measuring against the same lying counter. Combined with earlier results (1x playback yields correct-pitch voice; reading at the counter's pace yields noise), the consistent model is: - The clip DATA genuinely is at clip.frequency (16kHz here). - Microphone.GetPosition's counter is inflated ~3.2x on macOS + BT-HFP; it does not describe the data. The choppiness on the plain path is the read head colliding with the bursty real write head due to a small, unmanaged startup lag — not a rate mismatch. Rework the servo accordingly: pitch stays pinned near 1.0 (max ±3% trim). 
The counter is used only after rescaling by its measured inflation factor k = counterRate / clip.frequency (~1 on healthy devices) to estimate the real write head, and the servo holds the read head a generous adaptive lag (150ms default) behind that estimate. Clip buffer extended to 2s for more collision headroom. Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/MicrophoneSource.cs | 129 +++++++++++++++------------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index a8cd1c77..1d9a8e6a 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -15,20 +15,24 @@ sealed public class MicrophoneSource : RtcAudioSource { // --- Playback pacing servo --- // The mic clip is filled by the capture device's clock while the AudioSource that plays it - // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. Worse, - // some devices misreport the clip rate entirely (a Bluetooth headset on macOS labeled its - // clip 16kHz while filling it at ~51kHz). Either way the read head drifts against the - // write head until it gets lapped, which sounds like periodic chopping. The servo measures - // how fast the write head actually advances and continuously adjusts AudioSource.pitch so - // the read head consumes clip samples at the same rate, holding a fixed lag behind the - // writer. In the normal case the measured rate matches clip.frequency and pitch stays ~1. 
- private const float PreRollSeconds = 0.3f; // initial fill-rate measurement window - private const float MinTargetLagSec = 0.08f; // smallest safety lag (good devices) - private const float MaxTargetLagSec = 0.25f; // adaptive ceiling (jittery devices) - private const float PitchCorrectionGain = 0.5f; // proportional gain on relative lag error - private const float MaxRelativeCorrection = 0.2f; - private const float MinPitch = 0.25f; - private const float MaxPitch = 8f; + // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. With a + // small, unmanaged startup lag the read head collides with the (bursty) write head and the + // captured audio chops. + // + // Microphone.GetPosition cannot be trusted directly: on macOS with a Bluetooth HFP headset + // its counter advances ~3.2x faster than data is actually written (clip labeled 16kHz, + // counter at ~51k/s). The clip DATA is still genuinely at clip.frequency — playing it at + // 1x yields correct-pitch voice, while reading at the counter's pace yields garbled + // repeats. So the servo keeps pitch at ~1.0 and uses the counter only after rescaling by + // its measured inflation factor k (counterRate / clip.frequency, ~1 on healthy devices) to + // estimate the real write head, holding the read head a generous lag behind it with only + // tiny pitch trims. 
+ private const float PreRollSeconds = 0.3f; // counter-rate measurement window + private const float DefaultTargetLagSec = 0.15f; // initial read-behind-write lag + private const float MinTargetLagSec = 0.10f; + private const float MaxTargetLagSec = 0.40f; // adaptive ceiling (jittery devices) + private const float TrimGain = 0.5f; // proportional gain on relative lag error + private const float MaxPitchTrim = 0.03f; // pitch stays within [0.97, 1.03] private readonly GameObject _sourceObject; private readonly string _deviceName; @@ -121,7 +125,7 @@ private IEnumerator StartMicrophone() clip = Microphone.Start( _deviceName, loop: true, - lengthSec: 1, + lengthSec: 2, frequency: micFrequency ); } @@ -183,50 +187,56 @@ private IEnumerator StartMicrophone() Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully"); } - // Keeps the AudioSource's read head locked a fixed lag behind the mic's write head by - // adjusting pitch (see the servo comment at the top of the class). Pitch is rate control, - // not a delay: the only latency this adds is the target lag itself. + // Keeps the AudioSource's read head a fixed lag behind the (estimated) real write head + // (see the servo comment at the top of the class). Pitch stays ~1.0 — the clip data rate + // IS clip.frequency — with only tiny trims; the added latency is the held lag itself. private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip) { int clipFrames = clip.samples; int declaredRate = clip.frequency; - // Pre-roll: measure the true fill rate before playback starts. GetPosition's - // instantaneous position can be jumpy on misbehaving devices, but its average advance - // has measured steady (±0.1%), so a short window gives a reliable rate. - int prevWrite = Microphone.GetPosition(_deviceName); - long writeAdvance = 0; + // Pre-roll: measure how fast GetPosition's counter advances. 
Its instantaneous value + // can be jumpy, but its average advance is steady (±0.1% measured), so a short window + // gives a reliable rate. k is the counter's inflation relative to the data rate. + int prevCounter = Microphone.GetPosition(_deviceName); + long counterUnwrapped = prevCounter; // counter ran since Microphone.Start; small so far + long preRollStart = counterUnwrapped; var preRoll = System.Diagnostics.Stopwatch.StartNew(); while (preRoll.Elapsed.TotalSeconds < PreRollSeconds) { if (!_started || _disposed || source == null || !Microphone.IsRecording(_deviceName)) yield break; yield return null; - int w = Microphone.GetPosition(_deviceName); - writeAdvance += ((w - prevWrite) % clipFrames + clipFrames) % clipFrames; - prevWrite = w; + int c = Microphone.GetPosition(_deviceName); + counterUnwrapped += ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; + prevCounter = c; } if (!_started || _disposed || source == null) yield break; - double fillRate = writeAdvance > 0 ? writeAdvance / preRoll.Elapsed.TotalSeconds : declaredRate; - double basePitch = fillRate / declaredRate; - // The lag target must stay well below the clip's real capacity (clipFrames at the - // true fill rate), which can be much shorter than lengthSec when the rate is misreported. - float capacityCapSec = (float)(0.4 * clipFrames / fillRate); - float targetLagSec = Mathf.Min(MinTargetLagSec, capacityCapSec); + double counterRate = (counterUnwrapped - preRollStart) / preRoll.Elapsed.TotalSeconds; + if (counterRate <= 0) counterRate = declaredRate; + double k = counterRate / declaredRate; // ~1 on healthy devices, ~3.2 on macOS BT-HFP - source.pitch = Mathf.Clamp((float)basePitch, MinPitch, MaxPitch); - source.Play(); - long targetLag = (long)(targetLagSec * fillRate); - int startRead = (int)(((prevWrite - targetLag) % clipFrames + clipFrames) % clipFrames); - source.timeSamples = startRead; + // Lag target, bounded by the clip's data capacity (clipFrames samples). 
+ float capacityCapSec = 0.4f * clipFrames / declaredRate; + float targetLagSec = Mathf.Min(DefaultTargetLagSec, capacityCapSec); + double target = targetLagSec * declaredRate; - Utils.Info($"MicrophoneSource pacing: measured={fillRate:F0}Hz declared={declaredRate}Hz pitch={source.pitch:F2} lag={targetLagSec * 1000:F0}ms"); + // Estimated real write head in data samples: the counter rescaled by k (both started + // at zero when capture began). + double writeEst = counterUnwrapped / k; + source.pitch = 1f; + source.Play(); + int startRead = (int)((((long)(writeEst - target)) % clipFrames + clipFrames) % clipFrames); + source.timeSamples = startRead; int prevRead = startRead; - double lag = targetLag; // current read-behind-write distance, in clip samples + double lag = target; // data samples the reader trails the estimated writer double smoothedLag = lag; double jitter = 0; - long rateAdvance = 0; + + Utils.Info($"MicrophoneSource pacing: counter={counterRate:F0}/s k={k:F2} dataRate={declaredRate}Hz lag={targetLagSec * 1000:F0}ms"); + + long counterWindow = 0; var rateWindow = System.Diagnostics.Stopwatch.StartNew(); var statusWindow = System.Diagnostics.Stopwatch.StartNew(); @@ -235,59 +245,60 @@ private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip) yield return null; if (source == null) yield break; - int w = Microphone.GetPosition(_deviceName); + int c = Microphone.GetPosition(_deviceName); int r = source.timeSamples; // Unwrapped per-frame advances. A hitch longer than the clip aliases these; the // resync guard below recovers from the resulting inconsistency. 
- long dw = ((w - prevWrite) % clipFrames + clipFrames) % clipFrames; + long dc = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; long dr = ((r - prevRead) % clipFrames + clipFrames) % clipFrames; - prevWrite = w; + prevCounter = c; prevRead = r; - lag += dw - dr; - rateAdvance += dw; + counterUnwrapped += dc; + counterWindow += dc; + lag += dc / k - dr; smoothedLag = 0.95 * smoothedLag + 0.05 * lag; jitter = 0.95 * jitter + 0.05 * Math.Abs(lag - smoothedLag); - // Refine the fill rate and adapt the lag target once per second. + // Refine the counter rate and adapt the lag target once per second. if (rateWindow.Elapsed.TotalSeconds >= 1.0) { - double instRate = rateAdvance / rateWindow.Elapsed.TotalSeconds; + double instRate = counterWindow / rateWindow.Elapsed.TotalSeconds; if (instRate > 0) { - fillRate = 0.7 * fillRate + 0.3 * instRate; - basePitch = fillRate / declaredRate; + counterRate = 0.7 * counterRate + 0.3 * instRate; + k = counterRate / declaredRate; } - rateAdvance = 0; + counterWindow = 0; rateWindow.Restart(); // Hold ~4x the observed jitter as safety margin, within bounds and capacity. - float jitterSec = (float)(jitter / fillRate); - capacityCapSec = (float)(0.4 * clipFrames / fillRate); + float jitterSec = (float)(jitter / declaredRate); targetLagSec = Mathf.Min(Mathf.Clamp(jitterSec * 4f, MinTargetLagSec, MaxTargetLagSec), capacityCapSec); + target = targetLagSec * declaredRate; } - // Proportional pitch correction toward the target lag. - double target = targetLagSec * fillRate; + // Tiny proportional pitch trim toward the target lag. The data rate is + // clip.frequency, so pitch must stay pinned near 1. 
double relErr = (smoothedLag - target) / target; - relErr = Math.Max(-MaxRelativeCorrection, Math.Min(MaxRelativeCorrection, relErr)); - source.pitch = Mathf.Clamp((float)(basePitch * (1.0 + PitchCorrectionGain * relErr)), MinPitch, MaxPitch); + relErr = Math.Max(-1.0, Math.Min(1.0, relErr)); + source.pitch = 1f + Mathf.Clamp((float)(TrimGain * relErr) * MaxPitchTrim, -MaxPitchTrim, MaxPitchTrim); // Out of bounds (reader overran the writer, or fell so far behind it reads // overwritten data): jump back to the target lag. Audible once, then stable. if (lag < 0 || lag > clipFrames * 0.9) { - int resyncRead = (int)(((w - (long)target) % clipFrames + clipFrames) % clipFrames); + int resyncRead = (int)((((long)(counterUnwrapped / k - target)) % clipFrames + clipFrames) % clipFrames); source.timeSamples = resyncRead; prevRead = resyncRead; lag = target; smoothedLag = target; - Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (rate={fillRate:F0}Hz pitch={source.pitch:F2})"); + Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (k={k:F2} pitch={source.pitch:F3})"); } if (statusWindow.Elapsed.TotalSeconds >= 5.0) { - Utils.Info($"MicrophoneSource pacing: rate={fillRate:F0}Hz pitch={source.pitch:F2} lag={smoothedLag / fillRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / fillRate * 1000:F1}ms"); + Utils.Info($"MicrophoneSource pacing: k={k:F2} pitch={source.pitch:F3} lag={smoothedLag / declaredRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / declaredRate * 1000:F1}ms"); statusWindow.Restart(); } } From e17fd2bbd75309728d9c84df35441b79a29317aa Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:46:07 +0200 Subject: [PATCH 6/7] Add editor-only mic clip WAV dump (temp diagnostic) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reworked servo's 
telemetry is perfect in the bad state (k=3.20, pitch~1.00, lag locked on target, jitter ~0, no resyncs) yet the published audio still chops like the unpaced path. That falsifies the read/write collision model: the reader is provably never near the writer. Remaining hypothesis: the chop is baked into the clip data itself — FMOD scatters the real 16kHz samples at the inflated counter's positions, leaving stale regions between fragments (~31% fresh per cycle). That would also explain why counter-paced reading sounds like noise with echo (fragments + stale older audio, fast). Snapshot the raw clip to a WAV 4s after capture starts (editor-only) so the buffer contents can be inspected directly: contiguous voice means the chop is downstream and still fixable; fragmented voice means capture data is destroyed at write time and the Unity Microphone path cannot work for this device. Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/MicrophoneSource.cs | 47 +++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 1d9a8e6a..16ddea69 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -184,9 +184,56 @@ private IEnumerator StartMicrophone() // Playback is started by the pacing servo, which first measures the clip's true fill // rate so the initial pitch and read position are right from the first sample. MonoBehaviourContext.RunCoroutine(PaceMicrophonePlayback(source, clip)); +#if UNITY_EDITOR + MonoBehaviourContext.RunCoroutine(DumpClipOnce(clip)); +#endif Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully"); } +#if UNITY_EDITOR + // TEMP diagnostic: snapshots the raw mic clip to a WAV so its contents can be inspected + // offline — is it one contiguous audio stream, or voice fragments scattered between stale + // regions? Speak continuously for the first ~5 seconds of capture. Editor-only. 
+ private IEnumerator DumpClipOnce(AudioClip clip) + { + yield return new WaitForSeconds(4f); + if (_disposed || clip == null) yield break; + try + { + var data = new float[clip.samples * clip.channels]; + clip.GetData(data, 0); + var path = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "lk_mic_clip.wav"); + WriteWav(path, data, clip.channels, clip.frequency); + Utils.Info($"MicrophoneSource: dumped clip snapshot to {path} ({clip.samples} frames @ {clip.frequency}Hz/{clip.channels}ch)"); + } + catch (Exception e) + { + Utils.Warning($"MicrophoneSource: clip dump failed: {e.Message}"); + } + } + + private static void WriteWav(string path, float[] samples, int channels, int sampleRate) + { + using var fs = new System.IO.FileStream(path, System.IO.FileMode.Create); + using var w = new System.IO.BinaryWriter(fs); + int dataBytes = samples.Length * 2; + w.Write(System.Text.Encoding.ASCII.GetBytes("RIFF")); + w.Write(36 + dataBytes); + w.Write(System.Text.Encoding.ASCII.GetBytes("WAVEfmt ")); + w.Write(16); + w.Write((short)1); // PCM + w.Write((short)channels); + w.Write(sampleRate); + w.Write(sampleRate * channels * 2); + w.Write((short)(channels * 2)); // block align + w.Write((short)16); // bits per sample + w.Write(System.Text.Encoding.ASCII.GetBytes("data")); + w.Write(dataBytes); + foreach (var s in samples) + w.Write((short)(Mathf.Clamp(s, -1f, 1f) * 32767f)); + } +#endif + // Keeps the AudioSource's read head a fixed lag behind the (estimated) real write head // (see the servo comment at the top of the class). Pitch stays ~1.0 — the clip data rate // IS clip.frequency — with only tiny trims; the added latency is the held lag itself. 
From 8ff252ba164642729138a5b53a24ad122f87fa03 Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:00:54 +0200 Subject: [PATCH 7/7] Reconstruct fragmented mic clip by reading valid samples per stride A raw dump of the mic clip in the macOS + Bluetooth HFP state revealed the true buffer structure: FMOD writes each real 20ms packet of clip.frequency audio, then advances the position counter as if it had written k (~3.2x) as much and zero-fills the skipped range. The buffer holds valid fragments of exactly 320 samples at a stride of exactly 1024 (320/1024 = 1/k), and the fragments join continuously (junction sample deltas within normal in-fragment variation) - i.e. the full audio stream is present, just zero-padded. Concatenating the fragments reconstructed clean, correct-pitch voice (verified by ear), which also explains every earlier symptom: plain playback = 31% voice + 69% silence (chop); counter-paced reading = fragments and padding played fast over a live buffer (noise with echo). Replace the pitch-servo playback approach with fragment-aware direct capture: - Read the clip ring buffer directly (no AudioSource, no OnAudioFilterRead), which also decouples capture from the output device's clock. - Pre-roll measures the counter rate (k = counterRate / clip.frequency) and the counter's smallest discrete jump (the stride J). - k ~ 1: plain contiguous read at the counter's pace (healthy devices). - k > 1.05: read only the first J/k samples of each stride - exactly the valid fragments - skipping the zero padding. - Downmix to mono and resample clip.frequency -> 48kHz (streaming linear; state carries across fragments since their junctions are continuous), into a native source fixed at 48kHz mono. - Backlog beyond 200ms after a stall is dropped, stride-aligned, to avoid overrunning the native queue. 
Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/MicrophoneSource.cs | 392 +++++++++++----------------- 1 file changed, 157 insertions(+), 235 deletions(-) diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 16ddea69..1a1823f0 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -1,5 +1,6 @@ using System; using System.Collections; +using System.Collections.Generic; using UnityEngine; using LiveKit.Internal; @@ -13,46 +14,59 @@ namespace LiveKit /// sealed public class MicrophoneSource : RtcAudioSource { - // --- Playback pacing servo --- - // The mic clip is filled by the capture device's clock while the AudioSource that plays it - // (feeding AudioProbe/OnAudioFilterRead) is driven by the output device's clock. With a - // small, unmanaged startup lag the read head collides with the (bursty) write head and the - // captured audio chops. + // --- Capture design --- + // The microphone clip's ring buffer is read directly (no AudioSource playback, no + // OnAudioFilterRead), so capture is decoupled from the output device's clock. // - // Microphone.GetPosition cannot be trusted directly: on macOS with a Bluetooth HFP headset - // its counter advances ~3.2x faster than data is actually written (clip labeled 16kHz, - // counter at ~51k/s). The clip DATA is still genuinely at clip.frequency — playing it at - // 1x yields correct-pitch voice, while reading at the counter's pace yields garbled - // repeats. So the servo keeps pitch at ~1.0 and uses the counter only after rescaling by - // its measured inflation factor k (counterRate / clip.frequency, ~1 on healthy devices) to - // estimate the real write head, holding the read head a generous lag behind it with only - // tiny pitch trims. 
- private const float PreRollSeconds = 0.3f; // counter-rate measurement window - private const float DefaultTargetLagSec = 0.15f; // initial read-behind-write lag - private const float MinTargetLagSec = 0.10f; - private const float MaxTargetLagSec = 0.40f; // adaptive ceiling (jittery devices) - private const float TrimGain = 0.5f; // proportional gain on relative lag error - private const float MaxPitchTrim = 0.03f; // pitch stays within [0.97, 1.03] - - private readonly GameObject _sourceObject; + // Microphone.GetPosition cannot be trusted as a sample position on every platform. On + // macOS with a Bluetooth HFP headset, FMOD writes each real 20ms packet of clip.frequency + // audio, then advances the position counter ~3.2x too far and zero-fills the skipped + // range. The buffer then holds valid fragments of N samples at a stride J (measured: 320 + // of every 1024) and the counter rate is k = J/N times the data rate. Inspection of a raw + // buffer dump showed the fragments are consecutive speech that joins continuously, so the + // stream is reconstructed losslessly by reading only the first N = J/k samples of each + // stride. Healthy devices have k ~ 1 and use a plain contiguous read. + // + // The clip's data rate is clip.frequency (verified: fragments play at correct pitch), so + // captured samples are resampled from clip.frequency to the fixed native-source rate. + private const uint TargetSampleRate = 48000; + private const float PreRollSeconds = 0.3f; + private const double FragmentedKThreshold = 1.05; + private const float MaxBacklogSeconds = 0.2f; // drop backlog beyond this after a stall + private readonly string _deviceName; public override event Action AudioRead; private bool _disposed = false; private bool _started = false; + private volatile bool _capturing = false; + + // Streaming linear-resampler state (input = clip.frequency, output = TargetSampleRate). 
+ private double _resamplePos; + private float _resamplePrev; /// /// Creates a new microphone source for the given device. /// /// The name of the device to capture from. Use to /// get the list of available devices. - /// The GameObject to attach the AudioSource to. The object must be kept in the scene - /// for the duration of the source's lifetime. - public MicrophoneSource(string deviceName, GameObject sourceObject) : base(RtcAudioSourceType.AudioSourceMicrophone) + /// Unused; retained for compatibility. The microphone clip is read + /// directly, so no scene GameObject/AudioSource is required. + public MicrophoneSource(string deviceName, GameObject sourceObject) + : base(RtcAudioSourceType.AudioSourceMicrophone, TargetSampleRate, 1) { _deviceName = deviceName; - _sourceObject = sourceObject; + } + + // The rate requested from Microphone.Start (a hint the platform may not honor), clamped to + // the device's reported range. The authoritative data rate is clip.frequency afterwards. + private static int ResolveRequestedSampleRate(string deviceName) + { + Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq); + if (minFreq == 0 && maxFreq == 0) + return (int)TargetSampleRate; + return Mathf.Clamp((int)TargetSampleRate, minFreq, maxFreq); } /// @@ -70,7 +84,6 @@ public override void Start() base.Start(); if (_started) return; - if (!Application.HasUserAuthorization(mode: UserAuthorization.Microphone)) throw new InvalidOperationException("Microphone access not authorized"); @@ -80,37 +93,8 @@ public override void Start() _started = true; } - // Opens the microphone at the engine's output sample rate when the device supports it, so - // the captured clip and the AudioSource that plays it back run at the same rate. A mismatch - // makes the looping clip drift against the playback read position and produces choppy audio. 
- // Falls back to DefaultMicrophoneSampleRate when the output rate is unknown, and clamps to - // the device's supported range when it reports one. - private static int ResolveMicrophoneSampleRate(string deviceName) - { - int target = AudioSettings.outputSampleRate; - if (target <= 0) - target = (int)DefaultMicrophoneSampleRate; - - Microphone.GetDeviceCaps(deviceName, out int minFreq, out int maxFreq); - // Unity reports (0, 0) when the device imposes no specific sample-rate range. - if (minFreq == 0 && maxFreq == 0) - return target; - - var result = Mathf.Clamp(target, minFreq, maxFreq); - Utils.Info($"ResolveMicrophoneSampleRate: {result}"); - - return result; - } - private IEnumerator StartMicrophone() { - // Validate that the GameObject is still valid before starting - if (_sourceObject == null) - { - Utils.Error("MicrophoneSource: GameObject is null, cannot start microphone"); - yield break; - } - // Verify microphone is still authorized (could change during background) if (!Application.HasUserAuthorization(UserAuthorization.Microphone)) { @@ -119,14 +103,14 @@ private IEnumerator StartMicrophone() } AudioClip clip = null; - var micFrequency = ResolveMicrophoneSampleRate(_deviceName); + int requestedRate = ResolveRequestedSampleRate(_deviceName); try { clip = Microphone.Start( _deviceName, loop: true, lengthSec: 2, - frequency: micFrequency + frequency: requestedRate ); } catch (Exception e) @@ -141,31 +125,6 @@ private IEnumerator StartMicrophone() yield break; } - Utils.Info($"MicrophoneSource device='{_deviceName}' opened at {micFrequency}Hz (output={AudioSettings.outputSampleRate}Hz)"); - - // Ensure no duplicate components exist before adding new ones. - // This is important during app resume on iOS where components might not be - // fully destroyed yet due to Unity's deferred Destroy(). 
- var existingSource = _sourceObject.GetComponent(); - if (existingSource != null) - UnityEngine.Object.DestroyImmediate(existingSource); - - var existingProbe = _sourceObject.GetComponent(); - if (existingProbe != null) - { - existingProbe.AudioRead -= OnAudioRead; - UnityEngine.Object.DestroyImmediate(existingProbe); - } - - var source = _sourceObject.AddComponent(); - source.clip = clip; - source.loop = true; - - var probe = _sourceObject.AddComponent(); - // Clear the audio data after it is read as to not play it through the speaker locally. - probe.ClearAfterInvocation(); - probe.AudioRead += OnAudioRead; - // Wait for microphone to actually start producing data with a timeout const float timeout = 2f; float elapsed = 0f; @@ -181,174 +140,155 @@ private IEnumerator StartMicrophone() yield break; } - // Playback is started by the pacing servo, which first measures the clip's true fill - // rate so the initial pitch and read position are right from the first sample. - MonoBehaviourContext.RunCoroutine(PaceMicrophonePlayback(source, clip)); -#if UNITY_EDITOR - MonoBehaviourContext.RunCoroutine(DumpClipOnce(clip)); -#endif - Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully"); - } - -#if UNITY_EDITOR - // TEMP diagnostic: snapshots the raw mic clip to a WAV so its contents can be inspected - // offline — is it one contiguous audio stream, or voice fragments scattered between stale - // regions? Speak continuously for the first ~5 seconds of capture. Editor-only. 
- private IEnumerator DumpClipOnce(AudioClip clip) - { - yield return new WaitForSeconds(4f); - if (_disposed || clip == null) yield break; - try - { - var data = new float[clip.samples * clip.channels]; - clip.GetData(data, 0); - var path = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "lk_mic_clip.wav"); - WriteWav(path, data, clip.channels, clip.frequency); - Utils.Info($"MicrophoneSource: dumped clip snapshot to {path} ({clip.samples} frames @ {clip.frequency}Hz/{clip.channels}ch)"); - } - catch (Exception e) - { - Utils.Warning($"MicrophoneSource: clip dump failed: {e.Message}"); - } - } + Utils.Info($"MicrophoneSource device='{_deviceName}' clip={clip.frequency}Hz/{clip.channels}ch samples={clip.samples} requested={requestedRate}Hz target={TargetSampleRate}Hz"); - private static void WriteWav(string path, float[] samples, int channels, int sampleRate) - { - using var fs = new System.IO.FileStream(path, System.IO.FileMode.Create); - using var w = new System.IO.BinaryWriter(fs); - int dataBytes = samples.Length * 2; - w.Write(System.Text.Encoding.ASCII.GetBytes("RIFF")); - w.Write(36 + dataBytes); - w.Write(System.Text.Encoding.ASCII.GetBytes("WAVEfmt ")); - w.Write(16); - w.Write((short)1); // PCM - w.Write((short)channels); - w.Write(sampleRate); - w.Write(sampleRate * channels * 2); - w.Write((short)(channels * 2)); // block align - w.Write((short)16); // bits per sample - w.Write(System.Text.Encoding.ASCII.GetBytes("data")); - w.Write(dataBytes); - foreach (var s in samples) - w.Write((short)(Mathf.Clamp(s, -1f, 1f) * 32767f)); + _capturing = true; + MonoBehaviourContext.RunCoroutine(CaptureLoop(clip)); } -#endif - // Keeps the AudioSource's read head a fixed lag behind the (estimated) real write head - // (see the servo comment at the top of the class). Pitch stays ~1.0 — the clip data rate - // IS clip.frequency — with only tiny trims; the added latency is the held lag itself. 
- private IEnumerator PaceMicrophonePlayback(AudioSource source, AudioClip clip) + // Reads new samples from the clip's ring buffer each frame and pushes them to the native + // source via AudioRead. Runs on the main thread; the native source's queue absorbs the + // per-frame pacing jitter. + private IEnumerator CaptureLoop(AudioClip clip) { int clipFrames = clip.samples; - int declaredRate = clip.frequency; + int channels = clip.channels; + int dataRate = clip.frequency > 0 ? clip.frequency : (int)DefaultMicrophoneSampleRate; - // Pre-roll: measure how fast GetPosition's counter advances. Its instantaneous value - // can be jumpy, but its average advance is steady (±0.1% measured), so a short window - // gives a reliable rate. k is the counter's inflation relative to the data rate. + // Pre-roll: measure how fast the position counter advances (its average is steady even + // when individual values jump) and the size of its smallest discrete jump. int prevCounter = Microphone.GetPosition(_deviceName); - long counterUnwrapped = prevCounter; // counter ran since Microphone.Start; small so far - long preRollStart = counterUnwrapped; + long advance = 0; + long minJump = long.MaxValue; var preRoll = System.Diagnostics.Stopwatch.StartNew(); while (preRoll.Elapsed.TotalSeconds < PreRollSeconds) { - if (!_started || _disposed || source == null || !Microphone.IsRecording(_deviceName)) yield break; + if (!_capturing || _disposed) yield break; yield return null; int c = Microphone.GetPosition(_deviceName); - counterUnwrapped += ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; + long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; prevCounter = c; + advance += d; + if (d > 0 && d < minJump) minJump = d; } - if (!_started || _disposed || source == null) yield break; - - double counterRate = (counterUnwrapped - preRollStart) / preRoll.Elapsed.TotalSeconds; - if (counterRate <= 0) counterRate = declaredRate; - double k = counterRate / declaredRate; // 
~1 on healthy devices, ~3.2 on macOS BT-HFP + if (!_capturing || _disposed) yield break; - // Lag target, bounded by the clip's data capacity (clipFrames samples). - float capacityCapSec = 0.4f * clipFrames / declaredRate; - float targetLagSec = Mathf.Min(DefaultTargetLagSec, capacityCapSec); - double target = targetLagSec * declaredRate; + double counterRate = advance > 0 ? advance / preRoll.Elapsed.TotalSeconds : dataRate; + double k = counterRate / dataRate; - // Estimated real write head in data samples: the counter rescaled by k (both started - // at zero when capture began). - double writeEst = counterUnwrapped / k; + // Fragmented mode: the counter advances in jumps of `stride`, but only the first + // `validPerStride` samples of each stride contain data; the rest is zero padding. + bool fragmented = k > FragmentedKThreshold && minJump != long.MaxValue && minJump > 1; + int stride = fragmented ? (int)minJump : 0; + int validPerStride = fragmented ? Math.Max(1, (int)Math.Round(stride / k)) : 0; - source.pitch = 1f; - source.Play(); - int startRead = (int)((((long)(writeEst - target)) % clipFrames + clipFrames) % clipFrames); - source.timeSamples = startRead; - int prevRead = startRead; - double lag = target; // data samples the reader trails the estimated writer - double smoothedLag = lag; - double jitter = 0; - - Utils.Info($"MicrophoneSource pacing: counter={counterRate:F0}/s k={k:F2} dataRate={declaredRate}Hz lag={targetLagSec * 1000:F0}ms"); + if (fragmented) + Utils.Info($"MicrophoneSource: fragmented clip detected (k={k:F2}); reading {validPerStride} of every {stride} samples at {dataRate}Hz"); + else + Utils.Info($"MicrophoneSource: contiguous capture (k={k:F2}) at {dataRate}Hz"); - long counterWindow = 0; - var rateWindow = System.Diagnostics.Stopwatch.StartNew(); - var statusWindow = System.Diagnostics.Stopwatch.StartNew(); + _resamplePos = 0.0; + _resamplePrev = 0f; + long maxBacklog = (long)(counterRate * MaxBacklogSeconds); + int readPos = 
prevCounter; // counter values land on jump boundaries + long pending = 0; - while (_started && !_disposed && source != null && Microphone.IsRecording(_deviceName)) + while (_capturing && !_disposed) { yield return null; - if (source == null) yield break; int c = Microphone.GetPosition(_deviceName); - int r = source.timeSamples; - // Unwrapped per-frame advances. A hitch longer than the clip aliases these; the - // resync guard below recovers from the resulting inconsistency. - long dc = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; - long dr = ((r - prevRead) % clipFrames + clipFrames) % clipFrames; + long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames; prevCounter = c; - prevRead = r; - counterUnwrapped += dc; - counterWindow += dc; - lag += dc / k - dr; + pending += d; - smoothedLag = 0.95 * smoothedLag + 0.05 * lag; - jitter = 0.95 * jitter + 0.05 * Math.Abs(lag - smoothedLag); + // After a long stall, drop the oldest backlog instead of pushing a burst that + // would overrun the native source's queue. + if (pending > maxBacklog) + { + long drop = pending - maxBacklog; + if (fragmented) drop -= drop % stride; // preserve stride alignment + readPos = (int)((readPos + drop) % clipFrames); + pending -= drop; + Utils.Warning($"MicrophoneSource: dropped {drop} buffered samples after a stall"); + } - // Refine the counter rate and adapt the lag target once per second. - if (rateWindow.Elapsed.TotalSeconds >= 1.0) + if (fragmented) { - double instRate = counterWindow / rateWindow.Elapsed.TotalSeconds; - if (instRate > 0) + while (pending >= stride) { - counterRate = 0.7 * counterRate + 0.3 * instRate; - k = counterRate / declaredRate; + EmitClipRange(clip, channels, dataRate, readPos, validPerStride, clipFrames); + readPos = (readPos + stride) % clipFrames; + pending -= stride; } - counterWindow = 0; - rateWindow.Restart(); - - // Hold ~4x the observed jitter as safety margin, within bounds and capacity. 
- float jitterSec = (float)(jitter / declaredRate); - targetLagSec = Mathf.Min(Mathf.Clamp(jitterSec * 4f, MinTargetLagSec, MaxTargetLagSec), capacityCapSec); - target = targetLagSec * declaredRate; } - - // Tiny proportional pitch trim toward the target lag. The data rate is - // clip.frequency, so pitch must stay pinned near 1. - double relErr = (smoothedLag - target) / target; - relErr = Math.Max(-1.0, Math.Min(1.0, relErr)); - source.pitch = 1f + Mathf.Clamp((float)(TrimGain * relErr) * MaxPitchTrim, -MaxPitchTrim, MaxPitchTrim); - - // Out of bounds (reader overran the writer, or fell so far behind it reads - // overwritten data): jump back to the target lag. Audible once, then stable. - if (lag < 0 || lag > clipFrames * 0.9) + else if (pending > 0) { - int resyncRead = (int)((((long)(counterUnwrapped / k - target)) % clipFrames + clipFrames) % clipFrames); - source.timeSamples = resyncRead; - prevRead = resyncRead; - lag = target; - smoothedLag = target; - Utils.Warning($"MicrophoneSource pacing: resync, lag reset to {targetLagSec * 1000:F0}ms (k={k:F2} pitch={source.pitch:F3})"); + EmitClipRange(clip, channels, dataRate, readPos, (int)pending, clipFrames); + readPos = (int)((readPos + pending) % clipFrames); + pending = 0; } + } + } + + // Reads `count` frames starting at `start`, splitting at the ring wrap so each GetData + // read is contiguous. + private void EmitClipRange(AudioClip clip, int channels, int dataRate, int start, int count, int clipFrames) + { + if (count <= 0) return; + int first = Math.Min(count, clipFrames - start); + ReadAndPush(clip, channels, dataRate, start, first); + if (count > first) + ReadAndPush(clip, channels, dataRate, 0, count - first); + } - if (statusWindow.Elapsed.TotalSeconds >= 5.0) + // Reads a contiguous range, downmixes to mono, resamples dataRate -> TargetSampleRate + // (streaming linear interpolation carrying state across calls, so fragment junctions stay + // continuous), and fires AudioRead. 
+ private void ReadAndPush(AudioClip clip, int channels, int dataRate, int start, int count) + { + if (count <= 0) return; + + var interleaved = new float[count * channels]; + clip.GetData(interleaved, start); + + float[] mono; + if (channels == 1) + { + mono = interleaved; + } + else + { + mono = new float[count]; + for (int f = 0; f < count; f++) { - Utils.Info($"MicrophoneSource pacing: k={k:F2} pitch={source.pitch:F3} lag={smoothedLag / declaredRate * 1000:F0}ms target={targetLagSec * 1000:F0}ms jitter={jitter / declaredRate * 1000:F1}ms"); - statusWindow.Restart(); + float sum = 0f; + for (int ch = 0; ch < channels; ch++) + sum += interleaved[f * channels + ch]; + mono[f] = sum / channels; } } + + double step = (double)dataRate / TargetSampleRate; + var output = new List<float>((int)(count / step) + 2); + + // Index -1 maps to the carried last sample of the previous chunk so interpolation is + // continuous across chunk boundaries. pos stays >= -1. + double pos = _resamplePos; + while (pos < count - 1) + { + int i0 = (int)Math.Floor(pos); + float a = i0 < 0 ? 
_resamplePrev : mono[i0]; + float b = mono[i0 + 1]; + float frac = (float)(pos - i0); + output.Add(a * (1f - frac) + b * frac); + pos += step; + } + _resamplePrev = mono[count - 1]; + _resamplePos = pos - count; + + if (output.Count > 0) + AudioRead?.Invoke(output.ToArray(), 1, (int)TargetSampleRate); } /// @@ -364,33 +304,15 @@ public override void Stop() private IEnumerator StopMicrophone() { + _capturing = false; + if (Microphone.IsRecording(_deviceName)) Microphone.End(_deviceName); - // Check if GameObject is still valid before trying to access components - if (_sourceObject != null) - { - var probe = _sourceObject.GetComponent<AudioProbe>(); - if (probe != null) - { - probe.AudioRead -= OnAudioRead; - UnityEngine.Object.Destroy(probe); - } - - var source = _sourceObject.GetComponent<AudioSource>(); - if (source != null) - UnityEngine.Object.Destroy(source); - } - Utils.Debug($"MicrophoneSource device='{_deviceName}' stopped"); yield return null; } - private void OnAudioRead(float[] data, int channels, int sampleRate) - { - AudioRead?.Invoke(data, channels, sampleRate); - } - private void OnApplicationPause(bool pause) { if (!_started) @@ -458,4 +380,4 @@ protected override void Dispose(bool disposing) Dispose(false); } } -} \ No newline at end of file +}