aspnetcore/src/IISLib/percpu.h

305 lines
6.3 KiB
C++

// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for license information.
#pragma once
//
// PER_CPU<T> owns an array of T with one slot per logical processor
// reported at creation time. The header (this class) and the array
// live in a single aligned allocation: the members occupy the first
// cache line and the slots start in the following one, each slot
// spaced by a multiple of the cache-line size.
//
// Instances are produced by Create() and released by Dispose();
// the constructor and destructor are private and are never run.
//
template<typename T>
class PER_CPU
{
public:

    //
    // Allocates an instance and runs Initializer on every slot.
    //
    template<typename FunctionInitializer>
    inline static HRESULT Create(FunctionInitializer Initializer, __deref_out PER_CPU<T> ** ppInstance);

    //
    // Returns the slot selected by the current processor number.
    //
    inline T * GetLocal(VOID);

    //
    // Invokes Function on every slot, in index order.
    //
    template<typename FunctionForEach>
    inline VOID ForEach(FunctionForEach Function);

    //
    // Releases the memory block that holds this instance and its slots.
    //
    inline VOID Dispose(VOID);

private:

    //
    // Intentionally empty: Create() obtains raw aligned memory and
    // never invokes the constructor.
    //
    PER_CPU(VOID) { }

    //
    // Intentionally empty: Dispose() frees the raw memory and never
    // invokes the destructor.
    //
    ~PER_CPU(VOID) { }

    //
    // Records the slot count and stride, then initializes every slot.
    //
    template<typename FunctionInitializer>
    HRESULT Initialize(FunctionInitializer Initializer, DWORD NumberOfVariables, DWORD Alignment);

    //
    // Returns the address of slot number Index (no bounds check).
    //
    T * GetObject(DWORD Index);

    //
    // Reports the cache-line size and the processor count used for sizing.
    //
    static HRESULT GetProcessorInformation(__out DWORD * pCacheLineSize, __out DWORD * pNumberOfProcessors);

    //
    // Pointer to the beginning of the inlined array.
    //
    PVOID   m_pVariables;

    //
    // Distance in bytes between two consecutive slots.
    //
    SIZE_T  m_Alignment;

    //
    // Number of slots in the inlined array.
    //
    SIZE_T  m_VariablesCount;
};
template<typename T>
template<typename FunctionInitializer>
inline
// static
HRESULT
PER_CPU<T>::Create(
    FunctionInitializer         Initializer,
    __deref_out PER_CPU<T> **   ppInstance
    )
/*++
Routine Description:
    Allocates a PER_CPU<T> together with its inlined array of slots
    in one cache-line-aligned block, then initializes every slot.

    Layout of the block:
        [ first cache line : PER_CPU<T> members ]
        [ slot 0 ][ slot 1 ] ... [ slot N-1 ]
    where each slot is sizeof(T) rounded up to a whole number of
    cache lines, so that two slots never share a cache line.

Arguments:
    Initializer - Callable invoked once per slot with a T*.
    ppInstance  - Receives the new instance on success. It is left
                  untouched on failure.

Return:
    HRESULT - S_OK on success,
              E_OUTOFMEMORY if the aligned allocation fails,
              otherwise the failure reported by
              GetProcessorInformation or Initialize.
--*/
{
    HRESULT         hr = S_OK;
    DWORD           CacheLineSize = 0;
    DWORD           ObjectCacheLineSize = 0;
    DWORD           NumberOfProcessors = 0;
    //
    // Declared up front so that no 'goto Finished' jumps over an
    // initialized declaration.
    //
    SIZE_T          Size = 0;
    PER_CPU<T> *    pInstance = NULL;

    hr = GetProcessorInformation(&CacheLineSize,
                                 &NumberOfProcessors);
    if (FAILED(hr))
    {
        goto Finished;
    }

    if (sizeof(T) > CacheLineSize)
    {
        //
        // Round up to the next multiple of the cache line size.
        // The mask must be the complement of (CacheLineSize - 1);
        // masking with (CacheLineSize - 1) itself would produce the
        // remainder and make the slots overlap.
        // This relies on CacheLineSize being a power of two, which
        // _aligned_malloc requires of its alignment argument as well.
        //
        ObjectCacheLineSize = static_cast<DWORD>(
            (sizeof(T) + CacheLineSize - 1) & ~static_cast<SIZE_T>(CacheLineSize - 1));
    }
    else
    {
        ObjectCacheLineSize = CacheLineSize;
    }

    //
    // Calculate the size of the PER_CPU<T> object, including the array.
    // The first cache line is for the member variables and the array
    // starts in the next cache line.
    // The product is computed in SIZE_T to avoid 32-bit wrap-around.
    //
    Size = CacheLineSize + static_cast<SIZE_T>(NumberOfProcessors) * ObjectCacheLineSize;

    pInstance = static_cast<PER_CPU<T>*>(_aligned_malloc(Size, CacheLineSize));
    if (pInstance == NULL)
    {
        hr = E_OUTOFMEMORY;
        goto Finished;
    }
    ZeroMemory(pInstance, Size);

    //
    // The array starts in the 2nd cache line.
    //
    pInstance->m_pVariables = reinterpret_cast<PBYTE>(pInstance) + CacheLineSize;

    //
    // Record the slot count and stride and initialize every slot.
    //
    hr = pInstance->Initialize(Initializer,
                               NumberOfProcessors,
                               ObjectCacheLineSize);
    if (FAILED(hr))
    {
        goto Finished;
    }

    //
    // Success: hand ownership to the caller.
    //
    *ppInstance = pInstance;
    pInstance = NULL;

Finished:

    if (pInstance != NULL)
    {
        //
        // Failure after allocation: release the memory block.
        //
        pInstance->Dispose();
        pInstance = NULL;
    }

    return hr;
}
template<typename T>
inline
T *
PER_CPU<T>::GetLocal(
    VOID
    )
/*++
Routine Description:
    Returns the slot associated with the processor the calling
    thread is currently running on.

    Use GetCurrentProcessorNumber (up to 64 logical processors) instead of
    GetCurrentProcessorNumberEx (more than 64 logical processors) because
    the number of processors are not densely packed per group.
    The idea of distributing variables per CPU is to have
    a scalability multiplier (could be NUMA node instead).

Return:
    T * - Pointer to a slot inside the inlined array.
--*/
{
    DWORD Index = GetCurrentProcessorNumber();

    //
    // Make sure the index doesn't go beyond the array size. If the
    // processor number is not smaller than the slot count, wrap it
    // around: the distribution is no longer even, but that is still
    // better than one single variable and never reads out of bounds.
    // The common in-range case skips the division entirely.
    //
    if (m_VariablesCount != 0 && Index >= m_VariablesCount)
    {
        Index = static_cast<DWORD>(Index % m_VariablesCount);
    }

    return GetObject(Index);
}
template<typename T>
inline
T *
PER_CPU<T>::GetObject(
    DWORD Index
    )
/*++
Routine Description:
    Computes the address of one slot of the inlined array.
    No bounds check is performed on Index.

Arguments:
    Index - Zero-based slot number.

Return:
    T * - m_pVariables advanced by Index * m_Alignment bytes.
--*/
{
    PBYTE   pArrayStart = static_cast<PBYTE>(m_pVariables);
    SIZE_T  ByteOffset = Index * m_Alignment;

    return reinterpret_cast<T*>(pArrayStart + ByteOffset);
}
template<typename T>
template<typename FunctionForEach>
inline
VOID
PER_CPU<T>::ForEach(
    FunctionForEach Function
    )
/*++
Routine Description:
    Visits every slot of the inlined array, from slot 0 up to
    slot m_VariablesCount - 1, and calls Function on each one.

Arguments:
    Function - Callable that accepts a T*.
--*/
{
    DWORD Slot = 0;

    while (Slot < m_VariablesCount)
    {
        T * pSlot = GetObject(Slot);
        Function(pSlot);
        ++Slot;
    }
}
template<typename T>
VOID
PER_CPU<T>::Dispose(
    VOID
    )
/*++
Routine Description:
    Releases the aligned memory block that holds this instance
    and its inlined array. No per-slot cleanup is performed here.
    The instance must not be used after this call.
--*/
{
    PVOID pBlock = this;

    _aligned_free(pBlock);
}
template<typename T>
template<typename FunctionInitializer>
inline
HRESULT
PER_CPU<T>::Initialize(
    FunctionInitializer Initializer,
    DWORD               NumberOfVariables,
    DWORD               Alignment
    )
/*++
Routine Description:
    Stores the slot count and the slot stride, then runs the
    initializer function once on every slot, in index order.

Arguments:
    Initializer       - Callable invoked with a T* for each slot.
                        NOTE(review): whatever it returns is discarded,
                        so a failing initializer is not detected here.
    NumberOfVariables - The length of the array of variables.
    Alignment         - Distance in bytes between consecutive slots,
                        used for avoiding false sharing.

Return:
    HRESULT - Always S_OK.
--*/
{
    m_Alignment = Alignment;
    m_VariablesCount = NumberOfVariables;

    for (DWORD Slot = 0; Slot < m_VariablesCount; ++Slot)
    {
        T * pSlot = GetObject(Slot);
        Initializer(pSlot);
    }

    return S_OK;
}
template<typename T>
// static
HRESULT
PER_CPU<T>::GetProcessorInformation(
    __out DWORD * pCacheLineSize,
    __out DWORD * pNumberOfProcessors
    )
/*++
Routine Description:
    Supplies the two values used to size the per-CPU array.
    The cache-line size is the constant SYSTEM_CACHE_ALIGNMENT_SIZE
    (it is not queried at run time); the processor count is
    dwNumberOfProcessors as filled in by GetSystemInfo.
    This information is used for avoiding CPU false sharing.

Arguments:
    pCacheLineSize      - Receives the cache-line size.
    pNumberOfProcessors - Receives the processor count.

Return:
    HRESULT - Always S_OK.
--*/
{
    SYSTEM_INFO Info = { };

    GetSystemInfo(&Info);

    *pCacheLineSize = SYSTEM_CACHE_ALIGNMENT_SIZE;
    *pNumberOfProcessors = Info.dwNumberOfProcessors;

    return S_OK;
}