Merge pull request #4194 from MerryMage/audiofifo

audio_core: Simplify sink interface
2025-09-13 23:17:07 +00:00 · 2018-09-21 13:30:51 +01:00 · 2018-09-21 13:30:51 +01:00 · bb9e92c77c
parent 687e3e74ca c9c7097769
commit bb9e92c77c
12 changed files with 316 additions and 314 deletions
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <cstdarg>
 #include <mutex>
 #include <vector>
 #include <cubeb/cubeb.h>
@ -13,17 +14,16 @@ namespace AudioCore {

 struct CubebSink::Impl {
    unsigned int sample_rate = 0;
-    std::vector<std::string> device_list;

    cubeb* ctx = nullptr;
    cubeb_stream* stream = nullptr;

-    std::mutex queue_mutex;
-    std::vector<s16> queue;
+    std::function<void(s16*, std::size_t)> cb;

    static long DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
                             void* output_buffer, long num_frames);
    static void StateCallback(cubeb_stream* stream, void* user_data, cubeb_state state);
+    static void LogCallback(char const* fmt, ...);
 };

 CubebSink::CubebSink(std::string target_device_name) : impl(std::make_unique<Impl>()) {
@ -31,21 +31,23 @@ CubebSink::CubebSink(std::string target_device_name) : impl(std::make_unique<Imp
        LOG_CRITICAL(Audio_Sink, "cubeb_init failed");
        return;
    }
-
-    cubeb_devid output_device = nullptr;
-
-    cubeb_stream_params params;
-    params.rate = native_sample_rate;
-    params.channels = 2;
-    params.format = CUBEB_SAMPLE_S16NE;
-    params.layout = CUBEB_LAYOUT_STEREO;
+    cubeb_set_log_callback(CUBEB_LOG_NORMAL, &Impl::LogCallback);

    impl->sample_rate = native_sample_rate;

-    u32 minimum_latency = 0;
-    if (cubeb_get_min_latency(impl->ctx, &params, &minimum_latency) != CUBEB_OK)
-        LOG_CRITICAL(Audio_Sink, "Error getting minimum latency");
+    cubeb_stream_params params;
+    params.rate = impl->sample_rate;
+    params.channels = 2;
+    params.layout = CUBEB_LAYOUT_STEREO;
+    params.format = CUBEB_SAMPLE_S16NE;
+    params.prefs = CUBEB_STREAM_PREF_NONE;

+    u32 minimum_latency = 100 * impl->sample_rate / 1000; // Firefox default
+    if (cubeb_get_min_latency(impl->ctx, &params, &minimum_latency) != CUBEB_OK) {
+        LOG_CRITICAL(Audio_Sink, "Error getting minimum latency");
+    }
+
+    cubeb_devid output_device = nullptr;
    if (target_device_name != auto_device_name && !target_device_name.empty()) {
        cubeb_device_collection collection;
        if (cubeb_enumerate_devices(impl->ctx, CUBEB_DEVICE_TYPE_OUTPUT, &collection) != CUBEB_OK) {
@ -63,10 +65,22 @@ CubebSink::CubebSink(std::string target_device_name) : impl(std::make_unique<Imp
        }
    }

-    if (cubeb_stream_init(impl->ctx, &impl->stream, "Citra Audio Output", nullptr, nullptr,
-                          output_device, &params, std::max(512u, minimum_latency),
-                          &Impl::DataCallback, &Impl::StateCallback, impl.get()) != CUBEB_OK) {
-        LOG_CRITICAL(Audio_Sink, "Error initializing cubeb stream");
+    int stream_err = cubeb_stream_init(impl->ctx, &impl->stream, "CitraAudio", nullptr, nullptr,
+                                       output_device, &params, std::max(512u, minimum_latency),
+                                       &Impl::DataCallback, &Impl::StateCallback, impl.get());
+    if (stream_err != CUBEB_OK) {
+        switch (stream_err) {
+        case CUBEB_ERROR:
+        default:
+            LOG_CRITICAL(Audio_Sink, "Error initializing cubeb stream ({})", stream_err);
+            break;
+        case CUBEB_ERROR_INVALID_FORMAT:
+            LOG_CRITICAL(Audio_Sink, "Invalid format when initializing cubeb stream");
+            break;
+        case CUBEB_ERROR_DEVICE_UNAVAILABLE:
+            LOG_CRITICAL(Audio_Sink, "Device unavailable when initializing cubeb stream");
+            break;
+        }
        return;
    }

@ -77,8 +91,11 @@ CubebSink::CubebSink(std::string target_device_name) : impl(std::make_unique<Imp
 }

 CubebSink::~CubebSink() {
-    if (!impl->ctx)
+    if (!impl->ctx) {
        return;
+    }
+
+    impl->cb = nullptr;

    if (cubeb_stream_stop(impl->stream) != CUBEB_OK) {
        LOG_CRITICAL(Audio_Sink, "Error stopping cubeb stream");
@ -95,56 +112,62 @@ unsigned int CubebSink::GetNativeSampleRate() const {
    return impl->sample_rate;
 }

-void CubebSink::EnqueueSamples(const s16* samples, std::size_t sample_count) {
-    if (!impl->ctx)
-        return;
-
-    std::lock_guard lock{impl->queue_mutex};
-
-    impl->queue.reserve(impl->queue.size() + sample_count * 2);
-    std::copy(samples, samples + sample_count * 2, std::back_inserter(impl->queue));
-}
-
-size_t CubebSink::SamplesInQueue() const {
-    if (!impl->ctx)
-        return 0;
-
-    std::lock_guard lock{impl->queue_mutex};
-    return impl->queue.size() / 2;
+void CubebSink::SetCallback(std::function<void(s16*, std::size_t)> cb) {
+    impl->cb = cb;
 }

 long CubebSink::Impl::DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
                                   void* output_buffer, long num_frames) {
    Impl* impl = static_cast<Impl*>(user_data);
-    u8* buffer = reinterpret_cast<u8*>(output_buffer);
+    s16* buffer = reinterpret_cast<s16*>(output_buffer);

-    if (!impl)
-        return 0;
-
-    std::lock_guard lock{impl->queue_mutex};
-
-    std::size_t frames_to_write =
-        std::min(impl->queue.size() / 2, static_cast<std::size_t>(num_frames));
-
-    memcpy(buffer, impl->queue.data(), frames_to_write * sizeof(s16) * 2);
-    impl->queue.erase(impl->queue.begin(), impl->queue.begin() + frames_to_write * 2);
-
-    if (frames_to_write < num_frames) {
-        // Fill the rest of the frames with silence
-        memset(buffer + frames_to_write * sizeof(s16) * 2, 0,
-               (num_frames - frames_to_write) * sizeof(s16) * 2);
+    if (!impl || !impl->cb) {
+        LOG_DEBUG(Audio_Sink, "Emitting zeros");
+        std::memset(output_buffer, 0, num_frames * 2 * sizeof(s16));
+        return num_frames;
    }

+    impl->cb(buffer, num_frames);
+
    return num_frames;
 }

-void CubebSink::Impl::StateCallback(cubeb_stream* stream, void* user_data, cubeb_state state) {}
+void CubebSink::Impl::StateCallback(cubeb_stream* stream, void* user_data, cubeb_state state) {
+    switch (state) {
+    case CUBEB_STATE_STARTED:
+        LOG_INFO(Audio_Sink, "Cubeb Audio Stream Started");
+        break;
+    case CUBEB_STATE_STOPPED:
+        LOG_INFO(Audio_Sink, "Cubeb Audio Stream Stopped");
+        break;
+    case CUBEB_STATE_DRAINED:
+        LOG_INFO(Audio_Sink, "Cubeb Audio Stream Drained");
+        break;
+    case CUBEB_STATE_ERROR:
+        LOG_CRITICAL(Audio_Sink, "Cubeb Audio Stream Errored");
+        break;
+    }
+}
+
+/// Receives a printf-style message from cubeb (registered via cubeb_set_log_callback)
+/// and forwards the formatted text to the Audio_Sink log.
+void CubebSink::Impl::LogCallback(char const* format, ...) {
+    std::array<char, 512> buffer;
+    std::va_list args;
+    va_start(args, format);
+    // Use vsnprintf on every toolchain: it truncates messages that do not fit, whereas
+    // MSVC's vsprintf_s treats a too-small buffer as a fatal invalid-parameter error.
+    vsnprintf(buffer.data(), buffer.size(), format, args);
+    va_end(args);
+    // Guarantee termination even if formatting failed and left the buffer untouched.
+    buffer.back() = '\0';
+    LOG_INFO(Audio_Sink, "{}", buffer.data());
+}

 std::vector<std::string> ListCubebSinkDevices() {
    std::vector<std::string> device_list;
    cubeb* ctx;

-    if (cubeb_init(&ctx, "Citra Device Enumerator", nullptr) != CUBEB_OK) {
+    if (cubeb_init(&ctx, "CitraEnumerator", nullptr) != CUBEB_OK) {
        LOG_CRITICAL(Audio_Sink, "cubeb_init failed");
        return {};
    }
--- a/src/audio_core/cubeb_sink.h
+++ b/src/audio_core/cubeb_sink.h
@ -17,9 +17,7 @@ public:

    unsigned int GetNativeSampleRate() const override;

-    void EnqueueSamples(const s16* samples, std::size_t sample_count) override;
-
-    std::size_t SamplesInQueue() const override;
+    void SetCallback(std::function<void(s16*, std::size_t)> cb) override;

 private:
    struct Impl;
--- a/src/audio_core/dsp_interface.cpp
+++ b/src/audio_core/dsp_interface.cpp
@ -12,16 +12,13 @@
 namespace AudioCore {

 DspInterface::DspInterface() = default;
-
-DspInterface::~DspInterface() {
-    if (perform_time_stretching) {
-        FlushResidualStretcherAudio();
-    }
-}
+DspInterface::~DspInterface() = default;

 void DspInterface::SetSink(const std::string& sink_id, const std::string& audio_device) {
    const SinkDetails& sink_details = GetSinkDetails(sink_id);
    sink = sink_details.factory(audio_device);
+    sink->SetCallback(
+        [this](s16* buffer, std::size_t num_frames) { OutputCallback(buffer, num_frames); });
    time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate());
 }

@ -35,7 +32,7 @@ void DspInterface::EnableStretching(bool enable) {
        return;

    if (!enable) {
-        FlushResidualStretcherAudio();
+        flushing_time_stretcher = true;
    }
    perform_time_stretching = enable;
 }
@ -44,39 +41,41 @@ void DspInterface::OutputFrame(StereoFrame16& frame) {
    if (!sink)
        return;

-    // Implementation of the hardware volume slider with a dynamic range of 60 dB
-    double volume_scale_factor = std::exp(6.90775 * Settings::values.volume) * 0.001;
-    for (std::size_t i = 0; i < frame.size(); i++) {
-        frame[i][0] = static_cast<s16>(frame[i][0] * volume_scale_factor);
-        frame[i][1] = static_cast<s16>(frame[i][1] * volume_scale_factor);
-    }
-
-    if (perform_time_stretching) {
-        time_stretcher.AddSamples(&frame[0][0], frame.size());
-        std::vector<s16> stretched_samples = time_stretcher.Process(sink->SamplesInQueue());
-        sink->EnqueueSamples(stretched_samples.data(), stretched_samples.size() / 2);
-    } else {
-        constexpr std::size_t maximum_sample_latency = 2048; // about 64 miliseconds
-        if (sink->SamplesInQueue() > maximum_sample_latency) {
-            // This can occur if we're running too fast and samples are starting to back up.
-            // Just drop the samples.
-            return;
-        }
-
-        sink->EnqueueSamples(&frame[0][0], frame.size());
-    }
+    fifo.Push(frame.data(), frame.size());
 }

-void DspInterface::FlushResidualStretcherAudio() {
-    if (!sink)
-        return;
+/// Sink callback: fills `buffer` with `num_frames` interleaved stereo frames.
+/// Audio is taken from the FIFO (through the time stretcher when stretching is enabled);
+/// any shortfall is padded with the last emitted frame, then the volume setting is applied.
+void DspInterface::OutputCallback(s16* buffer, std::size_t num_frames) {
+    std::size_t frames_written;
+    if (perform_time_stretching) {
+        const std::vector<s16> in{fifo.Pop()};
+        const std::size_t num_in{in.size() / 2};
+        frames_written = time_stretcher.Process(in.data(), num_in, buffer, num_frames);
+    } else if (flushing_time_stretcher) {
+        // Stretching was just disabled: drain what the stretcher still holds first.
+        time_stretcher.Flush();
+        frames_written = time_stretcher.Process(nullptr, 0, buffer, num_frames);
+        // Append unstretched audio after the flushed frames (two s16 per frame)
+        // instead of overwriting them from the start of the buffer.
+        frames_written += fifo.Pop(buffer + 2 * frames_written, num_frames - frames_written);
+        flushing_time_stretcher = false;
+    } else {
+        frames_written = fifo.Pop(buffer, num_frames);
+    }

-    time_stretcher.Flush();
-    while (true) {
-        std::vector<s16> residual_audio = time_stretcher.Process(sink->SamplesInQueue());
-        if (residual_audio.empty())
-            break;
-        sink->EnqueueSamples(residual_audio.data(), residual_audio.size() / 2);
+    if (frames_written > 0) {
+        std::memcpy(&last_frame[0], buffer + 2 * (frames_written - 1), 2 * sizeof(s16));
+    }
+
+    // Hold last emitted frame; this prevents popping.
+    for (std::size_t i = frames_written; i < num_frames; i++) {
+        std::memcpy(buffer + 2 * i, &last_frame[0], 2 * sizeof(s16));
+    }
+
+    // Implementation of the hardware volume slider with a dynamic range of 60 dB
+    const float linear_volume = std::clamp(Settings::values.volume, 0.0f, 1.0f);
+    if (linear_volume != 1.0) {
+        const float volume_scale_factor = std::exp(6.90775f * linear_volume) * 0.001f;
+        for (std::size_t i = 0; i < num_frames; i++) {
+            buffer[i * 2 + 0] = static_cast<s16>(buffer[i * 2 + 0] * volume_scale_factor);
+            buffer[i * 2 + 1] = static_cast<s16>(buffer[i * 2 + 1] * volume_scale_factor);
+        }
    }
 }

--- a/src/audio_core/dsp_interface.h
+++ b/src/audio_core/dsp_interface.h
@ -9,6 +9,7 @@
 #include "audio_core/audio_types.h"
 #include "audio_core/time_stretch.h"
 #include "common/common_types.h"
+#include "common/ring_buffer.h"
 #include "core/memory.h"

 namespace Service {
@ -81,9 +82,13 @@ protected:

 private:
    void FlushResidualStretcherAudio();
+    void OutputCallback(s16* buffer, std::size_t num_frames);

    std::unique_ptr<Sink> sink;
-    bool perform_time_stretching = false;
+    std::atomic<bool> perform_time_stretching = false;
+    std::atomic<bool> flushing_time_stretcher = false;
+    Common::RingBuffer<s16, 0x2000, 2> fifo;
+    std::array<s16, 2> last_frame{};
    TimeStretcher time_stretcher;
 };

--- a/src/audio_core/null_sink.h
+++ b/src/audio_core/null_sink.h
@ -19,11 +19,7 @@ public:
        return native_sample_rate;
    }

-    void EnqueueSamples(const s16*, std::size_t) override {}
-
-    std::size_t SamplesInQueue() const override {
-        return 0;
-    }
+    void SetCallback(std::function<void(s16*, std::size_t)>) override {}
 };

 } // namespace AudioCore
--- a/src/audio_core/sdl2_sink.cpp
+++ b/src/audio_core/sdl2_sink.cpp
@ -2,8 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <list>
-#include <numeric>
+#include <string>
+#include <vector>
 #include <SDL.h>
 #include "audio_core/audio_types.h"
 #include "audio_core/sdl2_sink.h"
@ -17,7 +17,7 @@ struct SDL2Sink::Impl {

    SDL_AudioDeviceID audio_device_id = 0;

-    std::list<std::vector<s16>> queue;
+    std::function<void(s16*, std::size_t)> cb;

    static void Callback(void* impl_, u8* buffer, int buffer_size_in_bytes);
 };
@ -74,58 +74,18 @@ unsigned int SDL2Sink::GetNativeSampleRate() const {
    return impl->sample_rate;
 }

-void SDL2Sink::EnqueueSamples(const s16* samples, std::size_t sample_count) {
-    if (impl->audio_device_id <= 0)
-        return;
-
-    SDL_LockAudioDevice(impl->audio_device_id);
-    impl->queue.emplace_back(samples, samples + sample_count * 2);
-    SDL_UnlockAudioDevice(impl->audio_device_id);
-}
-
-size_t SDL2Sink::SamplesInQueue() const {
-    if (impl->audio_device_id <= 0)
-        return 0;
-
-    SDL_LockAudioDevice(impl->audio_device_id);
-
-    std::size_t total_size =
-        std::accumulate(impl->queue.begin(), impl->queue.end(), static_cast<std::size_t>(0),
-                        [](std::size_t sum, const auto& buffer) {
-                            // Division by two because each stereo sample is made of
-                            // two s16.
-                            return sum + buffer.size() / 2;
-                        });
-
-    SDL_UnlockAudioDevice(impl->audio_device_id);
-
-    return total_size;
+void SDL2Sink::SetCallback(std::function<void(s16*, std::size_t)> cb) {
+    impl->cb = cb;
 }

 void SDL2Sink::Impl::Callback(void* impl_, u8* buffer, int buffer_size_in_bytes) {
    Impl* impl = reinterpret_cast<Impl*>(impl_);
+    if (!impl || !impl->cb) {
+        // SDL does not pre-clear the buffer, so returning without writing would play
+        // stale memory. Emit silence instead, as the cubeb sink does in this case.
+        SDL_memset(buffer, 0, static_cast<std::size_t>(buffer_size_in_bytes));
+        return;
+    }

-    std::size_t remaining_size = static_cast<std::size_t>(buffer_size_in_bytes) /
-                                 sizeof(s16); // Keep track of size in 16-bit increments.
+    // One frame is two s16 values (interleaved stereo).
+    const size_t num_frames = buffer_size_in_bytes / (2 * sizeof(s16));

-    while (remaining_size > 0 && !impl->queue.empty()) {
-        if (impl->queue.front().size() <= remaining_size) {
-            memcpy(buffer, impl->queue.front().data(), impl->queue.front().size() * sizeof(s16));
-            buffer += impl->queue.front().size() * sizeof(s16);
-            remaining_size -= impl->queue.front().size();
-            impl->queue.pop_front();
-        } else {
-            memcpy(buffer, impl->queue.front().data(), remaining_size * sizeof(s16));
-            buffer += remaining_size * sizeof(s16);
-            impl->queue.front().erase(impl->queue.front().begin(),
-                                      impl->queue.front().begin() + remaining_size);
-            remaining_size = 0;
-        }
-    }
-
-    if (remaining_size > 0) {
-        memset(buffer, 0, remaining_size * sizeof(s16));
-    }
+    impl->cb(reinterpret_cast<s16*>(buffer), num_frames);
 }

 std::vector<std::string> ListSDL2SinkDevices() {
--- a/src/audio_core/sdl2_sink.h
+++ b/src/audio_core/sdl2_sink.h
@ -17,9 +17,7 @@ public:

    unsigned int GetNativeSampleRate() const override;

-    void EnqueueSamples(const s16* samples, std::size_t sample_count) override;
-
-    std::size_t SamplesInQueue() const override;
+    void SetCallback(std::function<void(s16*, std::size_t)> cb) override;

 private:
    struct Impl;
--- a/src/audio_core/sink.h
+++ b/src/audio_core/sink.h
@ -4,7 +4,7 @@

 #pragma once

-#include <vector>
+#include <functional>
 #include "common/common_types.h"

 namespace AudioCore {
@ -20,19 +20,16 @@ class Sink {
 public:
    virtual ~Sink() = default;

-    /// The native rate of this sink. The sink expects to be fed samples that respect this. (Units:
-    /// samples/sec)
+    /// The native rate of this sink. The sink expects to be fed samples that respect this.
+    /// (Units: samples/sec)
    virtual unsigned int GetNativeSampleRate() const = 0;

    /**
-     * Feed stereo samples to sink.
+     * Set callback for samples
     * @param samples Samples in interleaved stereo PCM16 format.
     * @param sample_count Number of samples.
     */
-    virtual void EnqueueSamples(const s16* samples, std::size_t sample_count) = 0;
-
-    /// Samples enqueued that have not been played yet.
-    virtual std::size_t SamplesInQueue() const = 0;
+    virtual void SetCallback(std::function<void(s16*, std::size_t)> cb) = 0;
 };

 } // namespace AudioCore
--- a/src/audio_core/time_stretch.cpp
+++ b/src/audio_core/time_stretch.cpp
@ -3,143 +3,75 @@
 // Refer to the license.txt file included.

 #include <algorithm>
-#include <chrono>
 #include <cmath>
-#include <vector>
+#include <cstddef>
+#include <memory>
 #include <SoundTouch.h>
 #include "audio_core/audio_types.h"
 #include "audio_core/time_stretch.h"
-#include "common/common_types.h"
 #include "common/logging/log.h"

-using steady_clock = std::chrono::steady_clock;
-
 namespace AudioCore {

-constexpr double MIN_RATIO = 0.1;
-constexpr double MAX_RATIO = 100.0;
-
-static double ClampRatio(double ratio) {
-    return std::clamp(ratio, MIN_RATIO, MAX_RATIO);
+TimeStretcher::TimeStretcher()
+    : sample_rate(native_sample_rate), sound_touch(std::make_unique<soundtouch::SoundTouch>()) {
+    sound_touch->setChannels(2);
+    sound_touch->setSampleRate(native_sample_rate);
+    sound_touch->setPitch(1.0);
+    sound_touch->setTempo(1.0);
 }

-constexpr double MIN_DELAY_TIME = 0.05;                 // Units: seconds
-constexpr double MAX_DELAY_TIME = 0.25;                 // Units: seconds
-constexpr std::size_t DROP_FRAMES_SAMPLE_DELAY = 16000; // Units: samples
-
-constexpr double SMOOTHING_FACTOR = 0.007;
-
-struct TimeStretcher::Impl {
-    soundtouch::SoundTouch soundtouch;
-
-    steady_clock::time_point frame_timer = steady_clock::now();
-    std::size_t samples_queued = 0;
-
-    double smoothed_ratio = 1.0;
-
-    double sample_rate = static_cast<double>(native_sample_rate);
-};
-
-std::vector<s16> TimeStretcher::Process(std::size_t samples_in_queue) {
-    // This is a very simple algorithm without any fancy control theory. It works and is stable.
-
-    double ratio = CalculateCurrentRatio();
-    ratio = CorrectForUnderAndOverflow(ratio, samples_in_queue);
-    impl->smoothed_ratio =
-        (1.0 - SMOOTHING_FACTOR) * impl->smoothed_ratio + SMOOTHING_FACTOR * ratio;
-    impl->smoothed_ratio = ClampRatio(impl->smoothed_ratio);
-
-    // SoundTouch's tempo definition the inverse of our ratio definition.
-    impl->soundtouch.setTempo(1.0 / impl->smoothed_ratio);
-
-    std::vector<s16> samples = GetSamples();
-    if (samples_in_queue >= DROP_FRAMES_SAMPLE_DELAY) {
-        samples.clear();
-        LOG_DEBUG(Audio, "Dropping frames!");
-    }
-    return samples;
-}
-
-TimeStretcher::TimeStretcher() : impl(std::make_unique<Impl>()) {
-    impl->soundtouch.setPitch(1.0);
-    impl->soundtouch.setChannels(2);
-    impl->soundtouch.setSampleRate(native_sample_rate);
-    Reset();
-}
-
-TimeStretcher::~TimeStretcher() {
-    impl->soundtouch.clear();
-}
+TimeStretcher::~TimeStretcher() = default;

 void TimeStretcher::SetOutputSampleRate(unsigned int sample_rate) {
-    impl->sample_rate = static_cast<double>(sample_rate);
-    impl->soundtouch.setRate(static_cast<double>(native_sample_rate) / impl->sample_rate);
+    // The parameter shadows the member of the same name, so the member has to be named
+    // explicitly; Process() divides by it to convert output frame counts into seconds.
+    this->sample_rate = sample_rate;
+    sound_touch->setSampleRate(sample_rate);
 }

-void TimeStretcher::AddSamples(const s16* buffer, std::size_t num_samples) {
-    impl->soundtouch.putSamples(buffer, static_cast<uint>(num_samples));
-    impl->samples_queued += num_samples;
+/// Feeds `num_in` input frames to SoundTouch, updates the stretch ratio from the
+/// input/output frame counts and the current backlog, and reads back up to `num_out`
+/// frames into `out`. Returns the number of frames actually written to `out`.
+std::size_t TimeStretcher::Process(const s16* in, std::size_t num_in, s16* out,
+                                   std::size_t num_out) {
+    if (num_out == 0) {
+        // No output requested. Skip the ratio update: dividing by a zero frame count
+        // below would produce inf/NaN, and std::max would not clear a NaN stretch_ratio.
+        sound_touch->putSamples(in, num_in);
+        return 0;
+    }
+
+    const double time_delta = static_cast<double>(num_out) / sample_rate; // seconds
+    double current_ratio = static_cast<double>(num_in) / static_cast<double>(num_out);
+
+    const double max_latency = 0.25; // seconds
+    const double max_backlog = sample_rate * max_latency;
+    const double backlog_fullness = sound_touch->numSamples() / max_backlog;
+    if (backlog_fullness > 4.0) {
+        // Too many samples in backlog: Don't push anymore on
+        num_in = 0;
+    }
+
+    // We ideally want the backlog to be about 50% full.
+    // This gives some headroom both ways to prevent underflow and overflow.
+    // We tweak current_ratio to encourage this.
+    constexpr double tweak_time_scale = 0.050; // seconds
+    const double tweak_correction = (backlog_fullness - 0.5) * (time_delta / tweak_time_scale);
+    current_ratio *= std::pow(1.0 + 2.0 * tweak_correction, tweak_correction < 0 ? 3.0 : 1.0);
+
+    // This low-pass filter smoothes out variance in the calculated stretch ratio.
+    // The time-scale determines how responsive this filter is.
+    constexpr double lpf_time_scale = 0.712; // seconds
+    const double lpf_gain = 1.0 - std::exp(-time_delta / lpf_time_scale);
+    stretch_ratio += lpf_gain * (current_ratio - stretch_ratio);
+
+    // Place a lower limit of 5% speed.  When a game boots up, there will be
+    // many silence samples.  These do not need to be timestretched.
+    stretch_ratio = std::max(stretch_ratio, 0.05);
+    sound_touch->setTempo(stretch_ratio);
+
+    LOG_DEBUG(Audio, "{:5}/{:5} ratio:{:0.6f} backlog:{:0.6f}", num_in, num_out, stretch_ratio,
+              backlog_fullness);
+
+    sound_touch->putSamples(in, num_in);
+    return sound_touch->receiveSamples(out, num_out);
+}
+
+void TimeStretcher::Clear() {
+    sound_touch->clear();
 }

 void TimeStretcher::Flush() {
-    impl->soundtouch.flush();
-}
-
-void TimeStretcher::Reset() {
-    impl->soundtouch.setTempo(1.0);
-    impl->soundtouch.clear();
-    impl->smoothed_ratio = 1.0;
-    impl->frame_timer = steady_clock::now();
-    impl->samples_queued = 0;
-    SetOutputSampleRate(native_sample_rate);
-}
-
-double TimeStretcher::CalculateCurrentRatio() {
-    const steady_clock::time_point now = steady_clock::now();
-    const std::chrono::duration<double> duration = now - impl->frame_timer;
-
-    const double expected_time =
-        static_cast<double>(impl->samples_queued) / static_cast<double>(native_sample_rate);
-    const double actual_time = duration.count();
-
-    double ratio;
-    if (expected_time != 0) {
-        ratio = ClampRatio(actual_time / expected_time);
-    } else {
-        ratio = impl->smoothed_ratio;
-    }
-
-    impl->frame_timer = now;
-    impl->samples_queued = 0;
-
-    return ratio;
-}
-
-double TimeStretcher::CorrectForUnderAndOverflow(double ratio, std::size_t sample_delay) const {
-    const std::size_t min_sample_delay =
-        static_cast<std::size_t>(MIN_DELAY_TIME * impl->sample_rate);
-    const std::size_t max_sample_delay =
-        static_cast<std::size_t>(MAX_DELAY_TIME * impl->sample_rate);
-
-    if (sample_delay < min_sample_delay) {
-        // Make the ratio bigger.
-        ratio = ratio > 1.0 ? ratio * ratio : sqrt(ratio);
-    } else if (sample_delay > max_sample_delay) {
-        // Make the ratio smaller.
-        ratio = ratio > 1.0 ? sqrt(ratio) : ratio * ratio;
-    }
-
-    return ClampRatio(ratio);
-}
-
-std::vector<s16> TimeStretcher::GetSamples() {
-    uint available = impl->soundtouch.numSamples();
-
-    std::vector<s16> output(static_cast<std::size_t>(available) * 2);
-
-    impl->soundtouch.receiveSamples(output.data(), available);
-
-    return output;
+    sound_touch->flush();
 }

 } // namespace AudioCore
--- a/src/audio_core/time_stretch.h
+++ b/src/audio_core/time_stretch.h
@ -4,57 +4,39 @@

 #pragma once

+#include <array>
 #include <cstddef>
 #include <memory>
-#include <vector>
 #include "common/common_types.h"

+namespace soundtouch {
+class SoundTouch;
+}
+
 namespace AudioCore {

-class TimeStretcher final {
+class TimeStretcher {
 public:
    TimeStretcher();
    ~TimeStretcher();

-    /**
-     * Set sample rate for the samples that Process returns.
-     * @param sample_rate The sample rate.
-     */
    void SetOutputSampleRate(unsigned int sample_rate);

-    /**
-     * Add samples to be processed.
-     * @param sample_buffer Buffer of samples in interleaved stereo PCM16 format.
-     * @param num_samples Number of samples.
-     */
-    void AddSamples(const s16* sample_buffer, std::size_t num_samples);
+    /// @param in       Input sample buffer
+    /// @param num_in   Number of input frames in `in`
+    /// @param out      Output sample buffer
+    /// @param num_out  Desired number of output frames in `out`
+    /// @returns Actual number of frames written to `out`
+    std::size_t Process(const s16* in, std::size_t num_in, s16* out, std::size_t num_out);
+
+    void Clear();

-    /// Flush audio remaining in internal buffers.
    void Flush();

-    /// Resets internal state and clears buffers.
-    void Reset();
-
-    /**
-     * Does audio stretching and produces the time-stretched samples.
-     * Timer calculations use sample_delay to determine how much of a margin we have.
-     * @param sample_delay How many samples are buffered downstream of this module and haven't been
-     * played yet.
-     * @return Samples to play in interleaved stereo PCM16 format.
-     */
-    std::vector<s16> Process(std::size_t sample_delay);
-
 private:
-    struct Impl;
-    std::unique_ptr<Impl> impl;
-
-    /// INTERNAL: ratio = wallclock time / emulated time
-    double CalculateCurrentRatio();
-    /// INTERNAL: If we have too many or too few samples downstream, nudge ratio in the appropriate
-    /// direction.
-    double CorrectForUnderAndOverflow(double ratio, std::size_t sample_delay) const;
-    /// INTERNAL: Gets the time-stretched samples from SoundTouch.
-    std::vector<s16> GetSamples();
+    unsigned int sample_rate;
+    std::unique_ptr<soundtouch::SoundTouch> sound_touch;
+    double stretch_ratio = 1.0;
 };

 } // namespace AudioCore
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -72,6 +72,7 @@ add_library(common STATIC
    param_package.cpp
    param_package.h
    quaternion.h
+    ring_buffer.h
    scm_rev.cpp
    scm_rev.h
    scope_exit.h
--- a/src/common/ring_buffer.h
+++ b/src/common/ring_buffer.h
@ -0,0 +1,111 @@
+// Copyright 2018 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Common {
+
+/// SPSC ring buffer
+/// @tparam T            Element type
+/// @tparam capacity     Number of slots in ring buffer
+/// @tparam granularity  Slot size in terms of number of elements
+template <typename T, std::size_t capacity, std::size_t granularity = 1>
+class RingBuffer {
+    /// A "slot" is made of `granularity` elements of `T`.
+    static constexpr std::size_t slot_size = granularity * sizeof(T);
+    // T must be safely memcpy-able and have a trivial default constructor.
+    static_assert(std::is_trivial_v<T>);
+    // Ensure capacity is sensible.
+    static_assert(capacity < std::numeric_limits<std::size_t>::max() / 2 / granularity);
+    static_assert((capacity & (capacity - 1)) == 0, "capacity must be a power of two");
+    // Ensure lock-free.
+    static_assert(std::atomic<std::size_t>::is_always_lock_free);
+
+public:
+    /// Pushes slots into the ring buffer
+    /// @param new_slots   Pointer to the slots to push
+    /// @param slot_count  Number of slots to push
+    /// @returns The number of slots actually pushed
+    std::size_t Push(const void* new_slots, std::size_t slot_count) {
+        const std::size_t write_index = m_write_index.load();
+        const std::size_t slots_free = capacity + m_read_index.load() - write_index;
+        const std::size_t push_count = std::min(slot_count, slots_free);
+
+        const std::size_t pos = write_index % capacity;
+        const std::size_t first_copy = std::min(capacity - pos, push_count);
+        const std::size_t second_copy = push_count - first_copy;
+
+        const char* in = static_cast<const char*>(new_slots);
+        std::memcpy(m_data.data() + pos * granularity, in, first_copy * slot_size);
+        in += first_copy * slot_size;
+        std::memcpy(m_data.data(), in, second_copy * slot_size);
+
+        m_write_index.store(write_index + push_count);
+
+        return push_count;
+    }
+
+    std::size_t Push(const std::vector<T>& input) {
+        return Push(input.data(), input.size() / granularity);
+    }
+
+    /// Pops slots from the ring buffer
+    /// @param output     Where to store the popped slots
+    /// @param max_slots  Maximum number of slots to pop
+    /// @returns The number of slots actually popped
+    std::size_t Pop(void* output, std::size_t max_slots = ~std::size_t(0)) {
+        const std::size_t read_index = m_read_index.load();
+        const std::size_t slots_filled = m_write_index.load() - read_index;
+        const std::size_t pop_count = std::min(slots_filled, max_slots);
+
+        const std::size_t pos = read_index % capacity;
+        const std::size_t first_copy = std::min(capacity - pos, pop_count);
+        const std::size_t second_copy = pop_count - first_copy;
+
+        char* out = static_cast<char*>(output);
+        std::memcpy(out, m_data.data() + pos * granularity, first_copy * slot_size);
+        out += first_copy * slot_size;
+        std::memcpy(out, m_data.data(), second_copy * slot_size);
+
+        m_read_index.store(read_index + pop_count);
+
+        return pop_count;
+    }
+
+    std::vector<T> Pop(std::size_t max_slots = ~std::size_t(0)) {
+        std::vector<T> out(std::min(max_slots, capacity) * granularity);
+        const std::size_t count = Pop(out.data(), out.size() / granularity);
+        out.resize(count * granularity);
+        return out;
+    }
+
+    /// @returns Number of slots used
+    std::size_t Size() const {
+        return m_write_index.load() - m_read_index.load();
+    }
+
+    /// @returns Maximum size of ring buffer
+    constexpr std::size_t Capacity() const {
+        return capacity;
+    }
+
+private:
+    // It is important to align the below variables for performance reasons:
+    // Having them on the same cache-line would result in false-sharing between them.
+    alignas(128) std::atomic<std::size_t> m_read_index{0};
+    alignas(128) std::atomic<std::size_t> m_write_index{0};
+
+    std::array<T, granularity * capacity> m_data;
+};
+
+} // namespace Common