diff --git a/Runtime/Scripts/AudioStream.cs b/Runtime/Scripts/AudioStream.cs index 4d9ab588..3288f839 100644 --- a/Runtime/Scripts/AudioStream.cs +++ b/Runtime/Scripts/AudioStream.cs @@ -50,6 +50,14 @@ public sealed class AudioStream : IDisposable private const int CrossfadeFrames = 128; // ~2.7ms @ 48kHz private int _skipCooldown = 0; + // --- Temporary receive diagnostics (Info level, emitted ~every 2s) --- + // Reveals whether choppiness is a buffer-starvation problem (underruns/low fill) versus a + // clean stream, and what rate/channels we are actually playing/requesting. + private long _diagWindowStartTicks; + private int _diagCallbacks; + private int _diagUnderruns; + private int _diagFramesReceived; + /// /// Creates a new audio stream from a remote audio track, attaching it to the /// given in the scene. @@ -147,6 +155,8 @@ private void OnAudioRead(float[] data, int channels, int sampleRate) lock (_lock) { + MaybeLogReceiveDiagnostics(channels, sampleRate); + // Single gate covering first-create and runtime format changes (e.g. after a // system audio device switch). When the FFI stream is missing or what we asked // Rust for no longer matches what Unity is delivering, post a (re)create to the @@ -214,6 +224,7 @@ static float S16ToFloat(short v) if (valuesAvailableToRead < data.Length) { _isPrimed = false; + _diagUnderruns++; Utils.Debug($"AudioStream underrun detected, re-priming (got {valuesAvailableToRead} samples but want to read {data.Length})"); // Output silence immediately instead of playing partial/choppy samples. @@ -370,6 +381,7 @@ private void OnAudioStreamEvent(AudioStreamEvent e) var data = new ReadOnlySpan(frame.Data.ToPointer(), frame.Length); _buffer.Write(data); } + _diagFramesReceived++; } } @@ -427,6 +439,25 @@ private void Dispose(bool disposing) Dispose(false); } + // Temporary diagnostic: ~every 2s logs buffer fill, underrun count, callback count and + // frames received so we can tell starvation (choppy) from a clean stream. 
+        // Called under _lock.
+        private void MaybeLogReceiveDiagnostics(int channels, int sampleRate)
+        {
+            // Every invocation counts toward the current window, including the reporting one.
+            _diagCallbacks += 1;
+
+            long timestamp = System.Diagnostics.Stopwatch.GetTimestamp();
+            if (_diagWindowStartTicks == 0)
+            {
+                // 0 doubles as "no window open yet": open one at the first callback.
+                _diagWindowStartTicks = timestamp;
+            }
+
+            long windowTicks = timestamp - _diagWindowStartTicks;
+            var elapsed = windowTicks / (double)System.Diagnostics.Stopwatch.Frequency;
+            if (elapsed >= 2.0)
+            {
+                // Report buffer fill as 0 when no buffer has been created yet.
+                float fill = 0f;
+                if (_buffer != null)
+                    fill = _buffer.AvailableReadInPercent();
+
+                Utils.Info($"AudioStream#{_trackHandleId} diag: out={sampleRate}Hz/{channels}ch ffi={_ffiSampleRate}Hz/{_ffiNumChannels}ch " +
+                           $"bufferFill={fill * 100f:F0}% callbacks={_diagCallbacks} underruns={_diagUnderruns} framesRecv={_diagFramesReceived} over={elapsed:F1}s");
+
+                // Start the next window at this callback and clear the per-window counters.
+                _diagWindowStartTicks = timestamp;
+                _diagCallbacks = 0;
+                _diagUnderruns = 0;
+                _diagFramesReceived = 0;
+            }
+        }
+ // For testing and debugging internal float GetBufferFill() { diff --git a/Runtime/Scripts/BasicAudioSource.cs b/Runtime/Scripts/BasicAudioSource.cs index 3b63680b..8193090d 100644 --- a/Runtime/Scripts/BasicAudioSource.cs +++ b/Runtime/Scripts/BasicAudioSource.cs @@ -19,9 +19,11 @@ sealed public class BasicAudioSource : RtcAudioSource /// Creates a new basic audio source for the given in the scene. /// /// The to capture from. - /// The number of channels to capture. /// The type of audio source. - public BasicAudioSource(AudioSource source, int channels = 2, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(channels, sourceType) + /// + /// The sample rate and channel count are taken from Unity's audio configuration. 
+ /// + public BasicAudioSource(AudioSource source, RtcAudioSourceType sourceType = RtcAudioSourceType.AudioSourceCustom) : base(sourceType) { _source = source; } diff --git a/Runtime/Scripts/MicrophoneSource.cs b/Runtime/Scripts/MicrophoneSource.cs index 904b8da7..1a1823f0 100644 --- a/Runtime/Scripts/MicrophoneSource.cs +++ b/Runtime/Scripts/MicrophoneSource.cs @@ -1,5 +1,6 @@ using System; using System.Collections; +using System.Collections.Generic; using UnityEngine; using LiveKit.Internal; @@ -13,25 +14,59 @@ namespace LiveKit /// sealed public class MicrophoneSource : RtcAudioSource { - private readonly GameObject _sourceObject; + // --- Capture design --- + // The microphone clip's ring buffer is read directly (no AudioSource playback, no + // OnAudioFilterRead), so capture is decoupled from the output device's clock. + // + // Microphone.GetPosition cannot be trusted as a sample position on every platform. On + // macOS with a Bluetooth HFP headset, FMOD writes each real 20ms packet of clip.frequency + // audio, then advances the position counter ~3.2x too far and zero-fills the skipped + // range. The buffer then holds valid fragments of N samples at a stride J (measured: 320 + // of every 1024) and the counter rate is k = J/N times the data rate. Inspection of a raw + // buffer dump showed the fragments are consecutive speech that joins continuously, so the + // stream is reconstructed losslessly by reading only the first N = J/k samples of each + // stride. Healthy devices have k ~ 1 and use a plain contiguous read. + // + // The clip's data rate is clip.frequency (verified: fragments play at correct pitch), so + // captured samples are resampled from clip.frequency to the fixed native-source rate. 
+ private const uint TargetSampleRate = 48000; + private const float PreRollSeconds = 0.3f; + private const double FragmentedKThreshold = 1.05; + private const float MaxBacklogSeconds = 0.2f; // drop backlog beyond this after a stall + private readonly string _deviceName; public override event Action AudioRead; private bool _disposed = false; private bool _started = false; + private volatile bool _capturing = false; + + // Streaming linear-resampler state (input = clip.frequency, output = TargetSampleRate). + private double _resamplePos; + private float _resamplePrev; /// /// Creates a new microphone source for the given device. /// /// The name of the device to capture from. Use to /// get the list of available devices. - /// The GameObject to attach the AudioSource to. The object must be kept in the scene - /// for the duration of the source's lifetime. - public MicrophoneSource(string deviceName, GameObject sourceObject) : base(2, RtcAudioSourceType.AudioSourceMicrophone) + /// Unused; retained for compatibility. The microphone clip is read + /// directly, so no scene GameObject/AudioSource is required. + public MicrophoneSource(string deviceName, GameObject sourceObject) + : base(RtcAudioSourceType.AudioSourceMicrophone, TargetSampleRate, 1) { _deviceName = deviceName; - _sourceObject = sourceObject; + } + + // The rate requested from Microphone.Start (a hint the platform may not honor), clamped to + // the device's reported range. The authoritative data rate is clip.frequency afterwards. 
+        private static int ResolveRequestedSampleRate(string deviceName)
+        {
+            const int preferredRate = (int)TargetSampleRate;
+            Microphone.GetDeviceCaps(deviceName, out int lowestRate, out int highestRate);
+
+            // When both reported bounds are 0 there is nothing to clamp against, so the
+            // target rate is requested as-is; otherwise it is clamped into [lowest, highest].
+            bool noReportedRange = lowestRate == 0 && highestRate == 0;
+            return noReportedRange
+                ? preferredRate
+                : Mathf.Clamp(preferredRate, lowestRate, highestRate);
         }
/// @@ -49,7 +84,6 @@ public override void Start() base.Start(); if (_started) return; - if (!Application.HasUserAuthorization(mode: UserAuthorization.Microphone)) throw new InvalidOperationException("Microphone access not authorized"); @@ -61,13 +95,6 @@ public override void Start() private IEnumerator StartMicrophone() { - // Validate that the GameObject is still valid before starting - if (_sourceObject == null) - { - Utils.Error("MicrophoneSource: GameObject is null, cannot start microphone"); - yield break; - } - // Verify microphone is still authorized (could change during background) if (!Application.HasUserAuthorization(UserAuthorization.Microphone)) { @@ -76,13 +103,14 @@ private IEnumerator StartMicrophone() } AudioClip clip = null; + int requestedRate = ResolveRequestedSampleRate(_deviceName); try { clip = Microphone.Start( _deviceName, loop: true, - lengthSec: 1, - frequency: (int)DefaultMicrophoneSampleRate + lengthSec: 2, + frequency: requestedRate ); } catch (Exception e) @@ -97,29 +125,6 @@ private IEnumerator StartMicrophone() yield break; } - // Ensure no duplicate components exist before adding new ones. - // This is important during app resume on iOS where components might not be - // fully destroyed yet due to Unity's deferred Destroy(). 
- var existingSource = _sourceObject.GetComponent(); - if (existingSource != null) - UnityEngine.Object.DestroyImmediate(existingSource); - - var existingProbe = _sourceObject.GetComponent(); - if (existingProbe != null) - { - existingProbe.AudioRead -= OnAudioRead; - UnityEngine.Object.DestroyImmediate(existingProbe); - } - - var source = _sourceObject.AddComponent(); - source.clip = clip; - source.loop = true; - - var probe = _sourceObject.AddComponent(); - // Clear the audio data after it is read as to not play it through the speaker locally. - probe.ClearAfterInvocation(); - probe.AudioRead += OnAudioRead; - // Wait for microphone to actually start producing data with a timeout const float timeout = 2f; float elapsed = 0f; @@ -135,8 +140,155 @@ private IEnumerator StartMicrophone() yield break; } - source.Play(); - Utils.Debug($"MicrophoneSource device='{_deviceName}' started successfully"); + Utils.Info($"MicrophoneSource device='{_deviceName}' clip={clip.frequency}Hz/{clip.channels}ch samples={clip.samples} requested={requestedRate}Hz target={TargetSampleRate}Hz"); + + _capturing = true; + MonoBehaviourContext.RunCoroutine(CaptureLoop(clip)); + }
+
+        // Reads new samples from the clip's ring buffer each frame and pushes them to the native
+        // source via AudioRead. Runs on the main thread; the native source's queue absorbs the
+        // per-frame pacing jitter.
+        //
+        // Phases:
+        //   1. Pre-roll (PreRollSeconds): measure the position counter's average rate and its
+        //      smallest non-zero jump, to choose between contiguous and fragmented reads.
+        //   2. Capture: every frame, convert the counter advance into pending frames, cap the
+        //      backlog at MaxBacklogSeconds, and emit the pending range(s).
+        // The coroutine ends as soon as _capturing is cleared or the source is disposed.
+        private IEnumerator CaptureLoop(AudioClip clip)
+        {
+            int clipFrames = clip.samples;
+            int channels = clip.channels;
+            int dataRate = clip.frequency > 0 ? clip.frequency : (int)DefaultMicrophoneSampleRate;
+
+            // Every position delta below is taken modulo clipFrames; an empty clip would turn
+            // that into a division by zero.
+            if (clipFrames <= 0)
+            {
+                Utils.Error($"MicrophoneSource: clip for device='{_deviceName}' has no samples, cannot capture");
+                yield break;
+            }
+
+            // Pre-roll: measure how fast the position counter advances (its average is steady even
+            // when individual values jump) and the size of its smallest discrete jump.
+            int prevCounter = Microphone.GetPosition(_deviceName);
+            long advance = 0;
+            long minJump = long.MaxValue;
+            var preRoll = System.Diagnostics.Stopwatch.StartNew();
+            while (preRoll.Elapsed.TotalSeconds < PreRollSeconds)
+            {
+                if (!_capturing || _disposed) yield break;
+                yield return null;
+                int c = Microphone.GetPosition(_deviceName);
+                // Wrap-safe forward distance on the ring, always in [0, clipFrames).
+                long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
+                prevCounter = c;
+                advance += d;
+                if (d > 0 && d < minJump) minJump = d;
+            }
+            if (!_capturing || _disposed) yield break;
+
+            // k is the ratio of counter rate to data rate; ~1 means the counter can be trusted.
+            double counterRate = advance > 0 ? advance / preRoll.Elapsed.TotalSeconds : dataRate;
+            double k = counterRate / dataRate;
+
+            // Fragmented mode: the counter advances in jumps of `stride`, but only the first
+            // `validPerStride` samples of each stride contain data; the rest is zero padding.
+            bool fragmented = k > FragmentedKThreshold && minJump != long.MaxValue && minJump > 1;
+            int stride = fragmented ? (int)minJump : 0;
+            int validPerStride = fragmented ? Math.Max(1, (int)Math.Round(stride / k)) : 0;
+
+            if (fragmented)
+                Utils.Info($"MicrophoneSource: fragmented clip detected (k={k:F2}); reading {validPerStride} of every {stride} samples at {dataRate}Hz");
+            else
+                Utils.Info($"MicrophoneSource: contiguous capture (k={k:F2}) at {dataRate}Hz");
+
+            // Fresh resampler state for this capture session.
+            _resamplePos = 0.0;
+            _resamplePrev = 0f;
+            long maxBacklog = (long)(counterRate * MaxBacklogSeconds);
+            int readPos = prevCounter; // counter values land on jump boundaries
+            long pending = 0;
+
+            while (_capturing && !_disposed)
+            {
+                yield return null;
+
+                // Stop()/Dispose() may have run while this coroutine was suspended (StopMicrophone
+                // clears _capturing and ends the device). Re-check before reading the counter so
+                // whatever position a stopped device reports is never treated as an advance and
+                // emitted as audio.
+                if (!_capturing || _disposed) yield break;
+
+                int c = Microphone.GetPosition(_deviceName);
+                long d = ((c - prevCounter) % clipFrames + clipFrames) % clipFrames;
+                prevCounter = c;
+                pending += d;
+
+                // After a long stall, drop the oldest backlog instead of pushing a burst that
+                // would overrun the native source's queue.
+                if (pending > maxBacklog)
+                {
+                    long drop = pending - maxBacklog;
+                    if (fragmented) drop -= drop % stride; // preserve stride alignment
+
+                    // Stride alignment can round the drop down to nothing; only act (and warn)
+                    // when samples are actually discarded.
+                    if (drop > 0)
+                    {
+                        readPos = (int)((readPos + drop) % clipFrames);
+                        pending -= drop;
+                        Utils.Warning($"MicrophoneSource: dropped {drop} buffered samples after a stall");
+                    }
+                }
+
+                if (fragmented)
+                {
+                    // Emit only the valid head of each complete stride; a partial stride stays
+                    // pending until the counter completes it.
+                    while (pending >= stride)
+                    {
+                        EmitClipRange(clip, channels, dataRate, readPos, validPerStride, clipFrames);
+                        readPos = (readPos + stride) % clipFrames;
+                        pending -= stride;
+                    }
+                }
+                else if (pending > 0)
+                {
+                    EmitClipRange(clip, channels, dataRate, readPos, (int)pending, clipFrames);
+                    readPos = (int)((readPos + pending) % clipFrames);
+                    pending = 0;
+                }
+            }
+        }
+
+        // Reads `count` frames starting at `start`, splitting at the ring wrap so each GetData
+        // read is contiguous. `start` is expected to be in [0, clipFrames); the callers above
+        // always reduce it modulo clipFrames.
+        private void EmitClipRange(AudioClip clip, int channels, int dataRate, int start, int count, int clipFrames)
+        {
+            if (count <= 0) return;
+            // Frames available before the end of the ring.
+            int first = Math.Min(count, clipFrames - start);
+            ReadAndPush(clip, channels, dataRate, start, first);
+            // Remainder, if any, continues from the beginning of the ring.
+            if (count > first)
+                ReadAndPush(clip, channels, dataRate, 0, count - first);
+        }
+
+        // Reads a contiguous range, downmixes to mono, resamples dataRate -> TargetSampleRate
+        // (streaming linear interpolation carrying state across calls, so fragment junctions stay
+        // continuous), and fires AudioRead.
+        private void ReadAndPush(AudioClip clip, int channels, int dataRate, int start, int count)
+        {
+            // Nothing to read, or a format the maths below cannot handle: channels <= 0 gives a
+            // negative/empty buffer size, and dataRate <= 0 gives a resample step that never
+            // advances.
+            if (count <= 0 || channels <= 0 || dataRate <= 0) return;
+
+            var interleaved = new float[count * channels];
+            clip.GetData(interleaved, start);
+
+            float[] mono;
+            if (channels == 1)
+            {
+                // Already mono: use the read buffer directly, no copy.
+                mono = interleaved;
+            }
+            else
+            {
+                // Downmix by averaging the channels of each frame.
+                mono = new float[count];
+                for (int f = 0; f < count; f++)
+                {
+                    float sum = 0f;
+                    for (int ch = 0; ch < channels; ch++)
+                        sum += interleaved[f * channels + ch];
+                    mono[f] = sum / channels;
+                }
+            }
+
+            // Input frames consumed per output sample.
+            double step = (double)dataRate / TargetSampleRate;
+            int lastIndex = count - 1;
+
+            // Count the output samples first by walking the exact position sequence the fill
+            // loop uses, so the output array is allocated once at its final size (no List
+            // growth and no ToArray copy on this per-frame path).
+            int outputCount = 0;
+            for (double p = _resamplePos; p < lastIndex; p += step)
+                outputCount++;
+
+            var output = new float[outputCount];
+
+            // Index -1 maps to the carried last sample of the previous chunk so interpolation is
+            // continuous across chunk boundaries. pos stays >= -1.
+            double pos = _resamplePos;
+            for (int n = 0; n < outputCount; n++)
+            {
+                int i0 = (int)Math.Floor(pos);
+                float a = i0 < 0 ? _resamplePrev : mono[i0];
+                float b = mono[i0 + 1];
+                float frac = (float)(pos - i0);
+                output[n] = a * (1f - frac) + b * frac;
+                pos += step;
+            }
+
+            // Carry the last input sample and the read position (relative to the next chunk's
+            // first frame) into the next call.
+            _resamplePrev = mono[lastIndex];
+            _resamplePos = pos - count;
+
+            if (outputCount > 0)
+                AudioRead?.Invoke(output, 1, (int)TargetSampleRate);
         }
/// @@ -152,33 +304,15 @@ public override void Stop() private IEnumerator StopMicrophone() { + _capturing = false; + if (Microphone.IsRecording(_deviceName)) Microphone.End(_deviceName); - // Check if GameObject is still valid before trying to access components - if (_sourceObject != null) - { - var probe = _sourceObject.GetComponent(); - if (probe != null) - { - probe.AudioRead -= OnAudioRead; - UnityEngine.Object.Destroy(probe); - } - - var source = _sourceObject.GetComponent(); - if (source != null) - UnityEngine.Object.Destroy(source); - } - Utils.Debug($"MicrophoneSource device='{_deviceName}' stopped"); yield return null; } - private void OnAudioRead(float[] data, int channels, int sampleRate) - { - AudioRead?.Invoke(data, channels, sampleRate); - } - private void OnApplicationPause(bool pause) { if (!_started) @@ -246,4 
+380,4 @@ protected override void Dispose(bool disposing) Dispose(false); } } -} \ No newline at end of file +} diff --git a/Runtime/Scripts/RtcAudioSource.cs b/Runtime/Scripts/RtcAudioSource.cs index a9af8a0a..c85e5a2d 100644 --- a/Runtime/Scripts/RtcAudioSource.cs +++ b/Runtime/Scripts/RtcAudioSource.cs @@ -83,20 +83,42 @@ private sealed class PendingAudioFrame private volatile bool _disposed = false; private int _audioReadCount = 0; - protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = RtcAudioSourceType.AudioSourceCustom) + // --- Temporary capture-rate diagnostics (Info level, emitted ~every 2s) --- + // Measures the effective sample rate from wall-clock time vs the rate we declared to the + // native source. A measured rate that differs from the declared rate means the format + // label on the frames is wrong (audio would sound fast/slow/choppy on the receiver). + private long _diagWindowStartTicks; // 0 = not started + private long _diagSamplesPerChannel; + private int _diagAcceptedFrames; + private int _diagDroppedFrames; + + // Device-capture sources (microphone, AudioSource taps) don't know their format ahead of + // time — it is whatever Unity's audio graph delivers. They use this constructor, which + // configures the native source from Unity's current output configuration. + protected RtcAudioSource(RtcAudioSourceType audioSourceType) + : this(audioSourceType, 0, 0) { } + + // Sources that generate a fixed, known format (e.g. test signal generators) declare it + // directly. Passing 0 for either value falls back to the device configuration. 
+ protected RtcAudioSource(RtcAudioSourceType audioSourceType, uint sampleRate, uint channels) { _sourceType = audioSourceType; - _expectedChannels = (uint)channels; + + if (sampleRate > 0 && channels > 0) + { + _expectedSampleRate = sampleRate; + _expectedChannels = channels; + } + else + { + (_expectedSampleRate, _expectedChannels) = ResolveDeviceFormat(); + } using var request = FFIBridge.Instance.NewRequest(); var newAudioSource = request.request; newAudioSource.Type = AudioSourceType.AudioSourceNative; - newAudioSource.NumChannels = (uint)channels; - newAudioSource.SampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone ? - DefaultMicrophoneSampleRate : DefaultSampleRate; - _expectedSampleRate = newAudioSource.SampleRate; - - Utils.Debug($"NewAudioSource: {newAudioSource.NumChannels} {newAudioSource.SampleRate}"); + newAudioSource.NumChannels = _expectedChannels; + newAudioSource.SampleRate = _expectedSampleRate; newAudioSource.Options = request.TempResource(); newAudioSource.Options.EchoCancellation = true; @@ -109,6 +131,49 @@ protected RtcAudioSource(int channels = 2, RtcAudioSourceType audioSourceType = Utils.Debug($"{DebugTag} created handle={Handle.DangerousGetHandle()} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}"); } + // Reads Unity's actual output audio configuration. The capture path delivers buffers at the + // DSP output rate/channel count (see AudioProbe), so this is the format the native source + // must match. Falls back to the platform defaults when Unity cannot report a configuration + // (e.g. batch mode without an audio device). + private (uint sampleRate, uint channels) ResolveDeviceFormat() + { + uint sampleRate = _sourceType == RtcAudioSourceType.AudioSourceMicrophone + ? 
DefaultMicrophoneSampleRate + : DefaultSampleRate; + uint channels = DefaultChannels; + + try + { + var config = UnityEngine.AudioSettings.GetConfiguration(); + if (config.sampleRate > 0) + sampleRate = (uint)config.sampleRate; + var configuredChannels = SpeakerModeChannels(config.speakerMode); + if (configuredChannels > 0) + channels = configuredChannels; + } + catch (Exception e) + { + Utils.Warning($"{DebugTag} could not read Unity audio configuration, using defaults: {e.Message}"); + } + + return (sampleRate, channels); + } + + private static uint SpeakerModeChannels(UnityEngine.AudioSpeakerMode mode) + { + switch (mode) + { + case UnityEngine.AudioSpeakerMode.Mono: return 1; + case UnityEngine.AudioSpeakerMode.Stereo: return 2; + case UnityEngine.AudioSpeakerMode.Quad: return 4; + case UnityEngine.AudioSpeakerMode.Surround: return 5; + case UnityEngine.AudioSpeakerMode.Mode5point1: return 6; + case UnityEngine.AudioSpeakerMode.Mode7point1: return 8; + case UnityEngine.AudioSpeakerMode.Prologic: return 2; + default: return 0; + } + } + /// /// Begin capturing audio samples from the underlying source. /// @@ -153,9 +218,19 @@ private void OnAudioRead(float[] data, int channels, int sampleRate) return; } + var willDrop = (uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels; + RecordCaptureDiagnostics(data.Length / channels, channels, sampleRate, willDrop); + + // The native source rejects frames whose rate/channels differ from how it was + // configured (it does not resample). This should not happen now that the source is + // configured from the device, but if Unity reports an inconsistent format — or the + // output configuration changes at runtime — we drop the frame instead of sending a + // mismatch the native side would error on. 
if ((uint)sampleRate != _expectedSampleRate || (uint)channels != _expectedChannels) { - Utils.Warning($"{DebugTag} audio frame #{frameIndex} metadata mismatch actualRate={sampleRate} actualChannels={channels} expectedRate={_expectedSampleRate} expectedChannels={_expectedChannels} sourceType={_sourceType}"); + if (frameIndex == 1 || frameIndex % 100 == 0) + Utils.Warning($"{DebugTag} dropping audio frame #{frameIndex}: format {sampleRate}/{channels} does not match source {_expectedSampleRate}/{_expectedChannels} (sourceType={_sourceType})"); + return; } var pendingBeforeSend = PendingFrameCount(); @@ -342,6 +417,28 @@ private static double ElapsedMilliseconds(long startedTimestamp) return (Stopwatch.GetTimestamp() - startedTimestamp) * 1000.0 / Stopwatch.Frequency; } + // Temporary diagnostic: accumulates captured audio over wall-clock time and, ~every 2s, + // logs the effective sample rate vs the rate declared to the native source. Runs on the + // audio thread; the periodic Info log is cheap. 
+        private void RecordCaptureDiagnostics(int samplesPerChannel, int channels, int sampleRate, bool dropped)
+        {
+            long timestamp = Stopwatch.GetTimestamp();
+            if (_diagWindowStartTicks == 0)
+            {
+                // 0 doubles as "no window open yet": open one at the first frame.
+                _diagWindowStartTicks = timestamp;
+            }
+
+            // Every frame counts toward the measured rate, whether it is sent or dropped.
+            _diagSamplesPerChannel += samplesPerChannel;
+            if (dropped)
+                _diagDroppedFrames += 1;
+            else
+                _diagAcceptedFrames += 1;
+
+            long windowTicks = timestamp - _diagWindowStartTicks;
+            var elapsed = windowTicks / (double)Stopwatch.Frequency;
+            if (elapsed >= 2.0)
+            {
+                // Effective per-channel sample rate over the window, from wall-clock time.
+                var measuredRate = _diagSamplesPerChannel / elapsed;
+                Utils.Info($"{DebugTag} capture diag: declared={_expectedSampleRate}Hz/{_expectedChannels}ch measuredRate={measuredRate:F0}Hz " +
+                           $"lastFrame={samplesPerChannel}smp/{channels}ch/{sampleRate}Hz accepted={_diagAcceptedFrames} dropped={_diagDroppedFrames} over={elapsed:F1}s");
+
+                // Start the next window at this frame and clear the per-window counters.
+                _diagWindowStartTicks = timestamp;
+                _diagSamplesPerChannel = 0;
+                _diagAcceptedFrames = 0;
+                _diagDroppedFrames = 0;
+            }
+        }
+ private string DebugTag => $"RtcAudioSource#{_debugId}"; } } diff --git a/Samples~/Meet/Assets/Runtime/MeetManager.cs b/Samples~/Meet/Assets/Runtime/MeetManager.cs index 225c7a0c..97b2cb70 100644 --- a/Samples~/Meet/Assets/Runtime/MeetManager.cs +++ b/Samples~/Meet/Assets/Runtime/MeetManager.cs @@ -453,8 +453,7 @@ private IEnumerator PublishLocalMicrophone() { if (_audioObjects.ContainsKey(LocalAudioTrackName)) yield break; - Microphone.Start(null, true, 10, 44100); - + // MicrophoneSource starts the device itself, so we only need the device name here. 
var audioObject = new GameObject($"My Microphone: {Microphone.devices[0]}"); audioObject.transform.SetParent(_audioTrackParent); diff --git a/Tests/PlayMode/Utils/SineWaveAudioSource.cs b/Tests/PlayMode/Utils/SineWaveAudioSource.cs index 907e9ccc..2337615b 100644 --- a/Tests/PlayMode/Utils/SineWaveAudioSource.cs +++ b/Tests/PlayMode/Utils/SineWaveAudioSource.cs @@ -31,7 +31,7 @@ public SineWaveAudioSource( int sampleRate = 48000, double frequencyHz = 440.0, float amplitude = 0.1f) - : base(channels, RtcAudioSourceType.AudioSourceCustom) + : base(RtcAudioSourceType.AudioSourceCustom, (uint)sampleRate, (uint)channels) { _channels = channels; _sampleRate = sampleRate;