gl_query_cache: Optimize query cache

Use a custom cache instead of relying on a ranged cache.
This commit is contained in:
ReinUsesLisp 2019-11-28 02:15:34 -03:00
parent aae8c180cb
commit 73d2d3342d
6 changed files with 217 additions and 79 deletions

View file

@ -4,6 +4,7 @@
#include <cinttypes> #include <cinttypes>
#include <cstring> #include <cstring>
#include <optional>
#include "common/assert.h" #include "common/assert.h"
#include "core/core.h" #include "core/core.h"
#include "core/core_timing.h" #include "core/core_timing.h"
@ -16,6 +17,8 @@
namespace Tegra::Engines { namespace Tegra::Engines {
using VideoCore::QueryType;
/// First register id that is actually a Macro call. /// First register id that is actually a Macro call.
constexpr u32 MacroRegistersStart = 0xE00; constexpr u32 MacroRegistersStart = 0xE00;
@ -614,10 +617,11 @@ void Maxwell3D::ProcessQueryCondition() {
void Maxwell3D::ProcessCounterReset() { void Maxwell3D::ProcessCounterReset() {
switch (regs.counter_reset) { switch (regs.counter_reset) {
case Regs::CounterReset::SampleCnt: case Regs::CounterReset::SampleCnt:
rasterizer.ResetCounter(VideoCore::QueryType::SamplesPassed); rasterizer.ResetCounter(QueryType::SamplesPassed);
break; break;
default: default:
UNIMPLEMENTED_MSG("counter_reset={}", static_cast<u32>(regs.counter_reset)); LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
static_cast<int>(regs.counter_reset));
break; break;
} }
} }
@ -670,7 +674,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
return 0; return 0;
case Regs::QuerySelect::SamplesPassed: case Regs::QuerySelect::SamplesPassed:
// Deferred. // Deferred.
rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed); rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed,
system.GPU().GetTicks());
return {}; return {};
default: default:
UNIMPLEMENTED_MSG("Unimplemented query select type {}", UNIMPLEMENTED_MSG("Unimplemented query select type {}",

View file

@ -6,6 +6,7 @@
#include <atomic> #include <atomic>
#include <functional> #include <functional>
#include <optional>
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/engines/fermi_2d.h" #include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h" #include "video_core/gpu.h"
@ -50,7 +51,7 @@ public:
virtual void ResetCounter(QueryType type) = 0; virtual void ResetCounter(QueryType type) = 0;
/// Records a GPU query and caches it /// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, QueryType type) = 0; virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
/// Notify rasterizer that all caches should be flushed to Switch memory /// Notify rasterizer that all caches should be flushed to Switch memory
virtual void FlushAll() = 0; virtual void FlushAll() = 0;

View file

@ -2,8 +2,10 @@
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <algorithm>
#include <cstring> #include <cstring>
#include <memory> #include <memory>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
@ -22,6 +24,13 @@ using VideoCore::QueryType;
namespace { namespace {
constexpr std::uintptr_t PAGE_SIZE = 4096;
constexpr int PAGE_SHIFT = 12;
constexpr std::size_t SMALL_QUERY_SIZE = 8; // Query size without timestamp
constexpr std::size_t LARGE_QUERY_SIZE = 16; // Query size with timestamp
constexpr std::ptrdiff_t TIMESTAMP_OFFSET = 8;
constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED}; constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
constexpr GLenum GetTarget(QueryType type) { constexpr GLenum GetTarget(QueryType type) {
@ -37,23 +46,19 @@ CounterStream::~CounterStream() = default;
void CounterStream::Update(bool enabled, bool any_command_queued) { void CounterStream::Update(bool enabled, bool any_command_queued) {
if (enabled) { if (enabled) {
if (!current) { Enable();
current = cache.GetHostCounter(last, type); } else {
} Disable(any_command_queued);
return;
} }
if (current) {
EndQuery(any_command_queued);
}
last = std::exchange(current, nullptr);
} }
void CounterStream::Reset(bool any_command_queued) { void CounterStream::Reset(bool any_command_queued) {
if (current) { if (current) {
EndQuery(any_command_queued); EndQuery(any_command_queued);
// Immediately start a new query to avoid disabling its state.
current = cache.GetHostCounter(nullptr, type);
} }
current = nullptr;
last = nullptr; last = nullptr;
} }
@ -67,6 +72,20 @@ std::shared_ptr<HostCounter> CounterStream::GetCurrent(bool any_command_queued)
return last; return last;
} }
/// Begins counting on this stream. Does nothing when a counter is already active; otherwise a new
/// host counter is requested from the cache with `last` passed as its dependency.
void CounterStream::Enable() {
    if (!current) {
        current = cache.GetHostCounter(last, type);
    }
}
/// Stops counting on this stream.
/// @param any_command_queued Forwarded unchanged to EndQuery for the active counter.
void CounterStream::Disable(bool any_command_queued) {
if (current) {
EndQuery(any_command_queued);
}
// Move the (possibly null) active counter into `last` and clear `current`; Enable() passes
// `last` as the dependency of the next counter it creates.
last = std::exchange(current, nullptr);
}
void CounterStream::EndQuery(bool any_command_queued) { void CounterStream::EndQuery(bool any_command_queued) {
if (!any_command_queued) { if (!any_command_queued) {
// There are chances a query waited on without commands (glDraw, glClear, glDispatch). Not // There are chances a query waited on without commands (glDraw, glClear, glDispatch). Not
@ -78,26 +97,57 @@ void CounterStream::EndQuery(bool any_command_queued) {
} }
QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& rasterizer) QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& rasterizer)
: RasterizerCache{rasterizer}, system{system}, : system{system}, rasterizer{rasterizer}, streams{{CounterStream{*this,
rasterizer{rasterizer}, streams{{CounterStream{*this, QueryType::SamplesPassed}}} {} QueryType::SamplesPassed}}} {}
QueryCache::~QueryCache() = default; QueryCache::~QueryCache() = default;
void QueryCache::Query(GPUVAddr gpu_addr, QueryType type) { void QueryCache::InvalidateRegion(CacheAddr addr, std::size_t size) {
const u64 addr_begin = static_cast<u64>(addr);
const u64 addr_end = addr_begin + static_cast<u64>(size);
const auto in_range = [addr_begin, addr_end](CachedQuery& query) {
const u64 cache_begin = query.GetCacheAddr();
const u64 cache_end = cache_begin + query.GetSizeInBytes();
return cache_begin < addr_end && addr_begin < cache_end;
};
const u64 page_end = addr_end >> PAGE_SHIFT;
for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
const auto& it = cached_queries.find(page);
if (it == std::end(cached_queries)) {
continue;
}
auto& contents = it->second;
for (auto& query : contents) {
if (!in_range(query)) {
continue;
}
rasterizer.UpdatePagesCachedCount(query.GetCpuAddr(), query.GetSizeInBytes(), -1);
Flush(query);
}
contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range),
std::end(contents));
}
}
/// Handles a flush request for the range starting at `addr` and spanning `size` bytes by
/// forwarding both arguments unchanged to InvalidateRegion.
void QueryCache::FlushRegion(CacheAddr addr, std::size_t size) {
// We can handle flushes in the same way as invalidations.
InvalidateRegion(addr, size);
}
void QueryCache::Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) {
auto& memory_manager = system.GPU().MemoryManager(); auto& memory_manager = system.GPU().MemoryManager();
const auto host_ptr = memory_manager.GetPointer(gpu_addr); const auto host_ptr = memory_manager.GetPointer(gpu_addr);
auto query = TryGet(host_ptr); CachedQuery* query = TryGet(ToCacheAddr(host_ptr));
if (!query) { if (!query) {
const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
ASSERT_OR_EXECUTE(cpu_addr, return;); ASSERT_OR_EXECUTE(cpu_addr, return;);
query = std::make_shared<CachedQuery>(type, *cpu_addr, host_ptr); query = &Register(CachedQuery(type, *cpu_addr, host_ptr));
Register(query);
} }
query->SetCounter(GetStream(type).GetCurrent(rasterizer.AnyCommandQueued())); query->SetCounter(GetStream(type).GetCurrent(rasterizer.AnyCommandQueued()), timestamp);
query->MarkAsModified(true, *this);
} }
void QueryCache::UpdateCounters() { void QueryCache::UpdateCounters() {
@ -117,34 +167,54 @@ void QueryCache::Reserve(QueryType type, OGLQuery&& query) {
std::shared_ptr<HostCounter> QueryCache::GetHostCounter(std::shared_ptr<HostCounter> dependency, std::shared_ptr<HostCounter> QueryCache::GetHostCounter(std::shared_ptr<HostCounter> dependency,
QueryType type) { QueryType type) {
const auto type_index = static_cast<std::size_t>(type); auto& reserve = reserved_queries[static_cast<std::size_t>(type)];
auto& reserve = reserved_queries[type_index]; OGLQuery query;
if (reserve.empty()) { if (reserve.empty()) {
return std::make_shared<HostCounter>(*this, std::move(dependency), type); query.Create(GetTarget(type));
} else {
query = std::move(reserve.back());
reserve.pop_back();
} }
auto counter = std::make_shared<HostCounter>(*this, std::move(dependency), type, return std::make_shared<HostCounter>(*this, std::move(dependency), type, std::move(query));
std::move(reserve.back()));
reserve.pop_back();
return counter;
} }
void QueryCache::FlushObjectInner(const std::shared_ptr<CachedQuery>& counter_) { CachedQuery& QueryCache::Register(CachedQuery&& cached_query) {
auto& counter = *counter_; const u64 page = static_cast<u64>(cached_query.GetCacheAddr()) >> PAGE_SHIFT;
auto& stream = GetStream(counter.GetType()); auto& stored_ref = cached_queries[page].emplace_back(std::move(cached_query));
rasterizer.UpdatePagesCachedCount(stored_ref.GetCpuAddr(), stored_ref.GetSizeInBytes(), 1);
return stored_ref;
}
/// Finds the cached query whose cache address is exactly `addr`.
/// Queries are bucketed by page (`addr >> PAGE_SHIFT`); only the matching bucket is scanned.
/// @returns A pointer to the stored query, or nullptr when no query is registered at `addr`.
///          The pointer refers to an element of the bucket's vector.
CachedQuery* QueryCache::TryGet(CacheAddr addr) {
    const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
    const auto bucket = cached_queries.find(page);
    if (bucket == std::end(cached_queries)) {
        return nullptr;
    }
    for (auto& candidate : bucket->second) {
        if (candidate.GetCacheAddr() == addr) {
            return &candidate;
        }
    }
    return nullptr;
}
void QueryCache::Flush(CachedQuery& cached_query) {
auto& stream = GetStream(cached_query.GetType());
// Waiting for a query while another query of the same target is enabled locks Nvidia's driver. // Waiting for a query while another query of the same target is enabled locks Nvidia's driver.
// To avoid this disable and re-enable keeping the dependency stream. // To avoid this disable and re-enable keeping the dependency stream.
const bool is_enabled = stream.IsEnabled(); // But we only have to do this if we have pending waits to be done.
if (is_enabled) { const bool slice_counter = stream.IsEnabled() && cached_query.WaitPending();
stream.Update(false, false); const bool any_command_queued = rasterizer.AnyCommandQueued();
if (slice_counter) {
stream.Update(false, any_command_queued);
} }
counter.Flush(); cached_query.Flush();
if (is_enabled) { if (slice_counter) {
stream.Update(true, false); stream.Update(true, any_command_queued);
} }
} }
@ -152,13 +222,6 @@ CounterStream& QueryCache::GetStream(QueryType type) {
return streams[static_cast<std::size_t>(type)]; return streams[static_cast<std::size_t>(type)];
} }
HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, QueryType type)
: cache{cache}, type{type}, dependency{std::move(dependency)} {
const GLenum target = GetTarget(type);
query.Create(target);
glBeginQuery(target, query.handle);
}
HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, QueryType type, HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, QueryType type,
OGLQuery&& query_) OGLQuery&& query_)
: cache{cache}, type{type}, dependency{std::move(dependency)}, query{std::move(query_)} { : cache{cache}, type{type}, dependency{std::move(dependency)}, query{std::move(query_)} {
@ -170,35 +233,80 @@ HostCounter::~HostCounter() {
} }
u64 HostCounter::Query() { u64 HostCounter::Query() {
if (query.handle == 0) { if (result) {
return result; return *result;
} }
glGetQueryObjectui64v(query.handle, GL_QUERY_RESULT, &result); u64 value;
glGetQueryObjectui64v(query.handle, GL_QUERY_RESULT, &value);
if (dependency) { if (dependency) {
result += dependency->Query(); value += dependency->Query();
} }
return result; return *(result = value);
}
/// Returns true when querying this counter will potentially wait for OpenGL.
/// Query() stores its value in `result` and returns that stored value on later calls without
/// touching OpenGL, so a wait can only happen while no result has been stored yet.
bool HostCounter::WaitPending() const noexcept {
    return !result.has_value();
}
CachedQuery::CachedQuery(QueryType type, VAddr cpu_addr, u8* host_ptr) CachedQuery::CachedQuery(QueryType type, VAddr cpu_addr, u8* host_ptr)
: RasterizerCacheObject{host_ptr}, type{type}, cpu_addr{cpu_addr}, host_ptr{host_ptr} {} : type{type}, cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
/// Move constructor: member-wise move. The type, addresses and timestamp are copied and
/// ownership of the host counter is transferred from the source object.
CachedQuery::CachedQuery(CachedQuery&&) noexcept = default;
CachedQuery::~CachedQuery() = default; CachedQuery::~CachedQuery() = default;
/// Move assignment: member-wise move. The type, addresses and timestamp are copied and
/// ownership of the host counter is transferred from the source object.
CachedQuery& CachedQuery::operator=(CachedQuery&&) noexcept = default;
void CachedQuery::Flush() { void CachedQuery::Flush() {
const u64 value = counter->Query(); // When counter is nullptr it means that it's just been reseted. We are supposed to write a zero
std::memcpy(host_ptr, &value, sizeof(value)); // in these cases.
const u64 value = counter ? counter->Query() : 0;
std::memcpy(host_ptr, &value, sizeof(u64));
if (timestamp) {
std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64));
}
} }
void CachedQuery::SetCounter(std::shared_ptr<HostCounter> counter_) { void CachedQuery::SetCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
if (counter) {
// If there's an old counter set it means the query is being rewritten by the game.
// To avoid losing the data forever, flush here.
Flush();
}
counter = std::move(counter_); counter = std::move(counter_);
timestamp = timestamp_;
} }
QueryType CachedQuery::GetType() const { bool CachedQuery::WaitPending() const noexcept {
return counter && counter->WaitPending();
}
QueryType CachedQuery::GetType() const noexcept {
return type; return type;
} }
/// Returns the guest CPU address this query was constructed with.
VAddr CachedQuery::GetCpuAddr() const noexcept {
return cpu_addr;
}
/// Returns the cache address for this query, obtained by converting `host_ptr` with ToCacheAddr.
CacheAddr CachedQuery::GetCacheAddr() const noexcept {
return ToCacheAddr(host_ptr);
}
/// Returns the number of cached bytes: LARGE_QUERY_SIZE when a timestamp is set,
/// SMALL_QUERY_SIZE otherwise.
u64 CachedQuery::GetSizeInBytes() const noexcept {
    if (timestamp) {
        return LARGE_QUERY_SIZE;
    }
    return SMALL_QUERY_SIZE;
}
} // namespace OpenGL } // namespace OpenGL

View file

@ -7,12 +7,12 @@
#include <array> #include <array>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <unordered_map>
#include <vector> #include <vector>
#include <glad/glad.h> #include <glad/glad.h>
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
@ -43,6 +43,10 @@ public:
} }
private: private:
void Enable();
void Disable(bool any_command_queued);
void EndQuery(bool any_command_queued); void EndQuery(bool any_command_queued);
QueryCache& cache; QueryCache& cache;
@ -53,12 +57,16 @@ private:
GLenum target; GLenum target;
}; };
class QueryCache final : public RasterizerCache<std::shared_ptr<CachedQuery>> { class QueryCache final {
public: public:
explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer);
~QueryCache(); ~QueryCache();
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type); void InvalidateRegion(CacheAddr addr, std::size_t size);
void FlushRegion(CacheAddr addr, std::size_t size);
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp);
void UpdateCounters(); void UpdateCounters();
@ -69,23 +77,26 @@ public:
std::shared_ptr<HostCounter> GetHostCounter(std::shared_ptr<HostCounter> dependency, std::shared_ptr<HostCounter> GetHostCounter(std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type); VideoCore::QueryType type);
protected:
void FlushObjectInner(const std::shared_ptr<CachedQuery>& counter) override;
private: private:
CachedQuery& Register(CachedQuery&& cached_query);
CachedQuery* TryGet(CacheAddr addr);
void Flush(CachedQuery& cached_query);
CounterStream& GetStream(VideoCore::QueryType type); CounterStream& GetStream(VideoCore::QueryType type);
Core::System& system; Core::System& system;
RasterizerOpenGL& rasterizer; RasterizerOpenGL& rasterizer;
std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
std::array<CounterStream, VideoCore::NumQueryTypes> streams; std::array<CounterStream, VideoCore::NumQueryTypes> streams;
std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> reserved_queries; std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> reserved_queries;
}; };
class HostCounter final { class HostCounter final {
public: public:
explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type);
explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type, OGLQuery&& query); VideoCore::QueryType type, OGLQuery&& query);
~HostCounter(); ~HostCounter();
@ -93,42 +104,54 @@ public:
/// Returns the current value of the query. /// Returns the current value of the query.
u64 Query(); u64 Query();
/// Returns true when querying this counter will potentially wait for OpenGL.
bool WaitPending() const noexcept;
private: private:
QueryCache& cache; QueryCache& cache;
VideoCore::QueryType type; VideoCore::QueryType type;
std::shared_ptr<HostCounter> dependency; ///< Counter queued before this one. std::shared_ptr<HostCounter> dependency; ///< Counter queued before this one.
OGLQuery query; ///< OpenGL query. OGLQuery query; ///< OpenGL query.
u64 result; ///< Added values of the counter. std::optional<u64> result; ///< Added values of the counter.
}; };
class CachedQuery final : public RasterizerCacheObject { class CachedQuery final {
public: public:
explicit CachedQuery(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr); explicit CachedQuery(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr);
CachedQuery(CachedQuery&&) noexcept;
CachedQuery(const CachedQuery&) = delete;
~CachedQuery(); ~CachedQuery();
CachedQuery& operator=(CachedQuery&&) noexcept;
/// Writes the counter value to host memory. /// Writes the counter value to host memory.
void Flush(); void Flush();
/// Updates the counter this cached query registered in guest memory will write when requested. /// Updates the counter this cached query registered in guest memory will write when requested.
void SetCounter(std::shared_ptr<HostCounter> counter); void SetCounter(std::shared_ptr<HostCounter> counter, std::optional<u64> timestamp);
/// Returns true when flushing this query will potentially wait for OpenGL.
bool WaitPending() const noexcept;
/// Returns the query type. /// Returns the query type.
VideoCore::QueryType GetType() const; VideoCore::QueryType GetType() const noexcept;
VAddr GetCpuAddr() const override { /// Returns the guest CPU address for this query.
return cpu_addr; VAddr GetCpuAddr() const noexcept;
}
std::size_t GetSizeInBytes() const override { /// Returns the cache address for this query.
return sizeof(u64); CacheAddr GetCacheAddr() const noexcept;
}
/// Returns the number of cached bytes.
u64 GetSizeInBytes() const noexcept;
private: private:
VideoCore::QueryType type; VideoCore::QueryType type; ///< Abstracted query type (e.g. samples passed).
VAddr cpu_addr; ///< Guest CPU address. VAddr cpu_addr; ///< Guest CPU address.
u8* host_ptr; ///< Writable host pointer. u8* host_ptr; ///< Writable host pointer.
std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
std::optional<u64> timestamp; ///< Timestamp to flush to guest memory.
}; };
} // namespace OpenGL } // namespace OpenGL

View file

@ -722,8 +722,9 @@ void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
query_cache.ResetCounter(type); query_cache.ResetCounter(type);
} }
void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type) { void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
query_cache.Query(gpu_addr, type); std::optional<u64> timestamp) {
query_cache.Query(gpu_addr, type, timestamp);
} }
void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushAll() {}

View file

@ -63,7 +63,7 @@ public:
void Clear() override; void Clear() override;
void DispatchCompute(GPUVAddr code_addr) override; void DispatchCompute(GPUVAddr code_addr) override;
void ResetCounter(VideoCore::QueryType type) override; void ResetCounter(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
void FlushAll() override; void FlushAll() override;
void FlushRegion(CacheAddr addr, u64 size) override; void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override;