def transpose(matrix uint16) : uint16 = {}
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
0 -3 -6 -9
3 0 -3 -6
...
answer = (source & 0x1248) | ((source << 3) & 0x2480) | ...
Типа так сложно сделать 76 лишних пикселей?
Span<T>
):
byte[,] data2D = ...;
// Size of dimension 1 of data2D; it is handed to ProcessData as the width.
int width = data2D.GetLength(1);
// Create a Span<byte> over data2D: it starts at element [0, 0] and covers data2D.Length elements.
// NOTE(review): data2D[0, 0] requires the array to contain at least one element.
Span<byte> data1D = MemoryMarshal.CreateSpan(ref data2D[0, 0], data2D.Length);
// Pass the Span<byte> to the function.
ProcessData(data1D, width);
Как перестать поддерживать ее, при выполнении клиентского кода в ходе динамической трансляции.
// Integer register table for the XARCH targets, written as macro invocations.
// Each REGDEF row supplies (name, rnum, mask, sname); in every row below the mask
// equals 1 << rnum. Each REGALIAS row pairs a second name with a register that the
// same branch declares via REGDEF (presumably alias first, target second — confirm
// against the macro definitions, which are outside this excerpt).
// NOTE(review): the #endif matching TARGET_XARCH is not visible in this excerpt.
#if defined(TARGET_XARCH)
#if defined(TARGET_X86)
// 32-bit branch: eight registers (rnum 0-7); the R-prefixed names alias the E-prefixed ones.
/*
REGDEF(name, rnum, mask, sname) */
REGDEF(EAX, 0, 0x01, "eax" )
REGDEF(ECX, 1, 0x02, "ecx" )
REGDEF(EDX, 2, 0x04, "edx" )
REGDEF(EBX, 3, 0x08, "ebx" )
REGDEF(ESP, 4, 0x10, "esp" )
REGDEF(EBP, 5, 0x20, "ebp" )
REGDEF(ESI, 6, 0x40, "esi" )
REGDEF(EDI, 7, 0x80, "edi" )
REGALIAS(RAX, EAX)
REGALIAS(RCX, ECX)
REGALIAS(RDX, EDX)
REGALIAS(RBX, EBX)
REGALIAS(RSP, ESP)
REGALIAS(RBP, EBP)
REGALIAS(RSI, ESI)
REGALIAS(RDI, EDI)
#else // !defined(TARGET_X86)
// Non-x86 branch: sixteen registers (rnum 0-15); here the E-prefixed names alias the R-prefixed ones.
/*
REGDEF(name, rnum, mask, sname) */
REGDEF(RAX, 0, 0x0001, "rax" )
REGDEF(RCX, 1, 0x0002, "rcx" )
REGDEF(RDX, 2, 0x0004, "rdx" )
REGDEF(RBX, 3, 0x0008, "rbx" )
REGDEF(RSP, 4, 0x0010, "rsp" )
REGDEF(RBP, 5, 0x0020, "rbp" )
REGDEF(RSI, 6, 0x0040, "rsi" )
REGDEF(RDI, 7, 0x0080, "rdi" )
REGDEF(R8, 8, 0x0100, "r8" )
REGDEF(R9, 9, 0x0200, "r9" )
REGDEF(R10, 10, 0x0400, "r10" )
REGDEF(R11, 11, 0x0800, "r11" )
REGDEF(R12, 12, 0x1000, "r12" )
REGDEF(R13, 13, 0x2000, "r13" )
REGDEF(R14, 14, 0x4000, "r14" )
REGDEF(R15, 15, 0x8000, "r15" )
REGALIAS(EAX, RAX)
REGALIAS(ECX, RCX)
REGALIAS(EDX, RDX)
REGALIAS(EBX, RBX)
REGALIAS(ESP, RSP)
REGALIAS(EBP, RBP)
REGALIAS(ESI, RSI)
REGALIAS(EDI, RDI)
#endif // !defined(TARGET_X86)
// Кодогенератор
// Generates code for a binary operation node. It chooses which operand acts as the
// destination and which as the source, depending on whether op1 or op2 already sits in
// targetReg, then emits either inc/dec (for an add of +1 / -1) or the generic binary instruction.
//
// treeNode - the binary operation node; its two operands are read via gtGetOp1()/gtGetOp2().
//
// NOTE(review): `targetType`, `targetReg`, `oper` and `emit` are used below but are not
// declared anywhere in this excerpt — presumably locals trimmed from the top of the
// function; confirm against the full source.
void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
{
GenTree* op1 = treeNode->gtGetOp1();
GenTree* op2 = treeNode->gtGetOp2();
instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
// Register of each operand, or REG_NA when the operand is not used from a register.
regNumber op1reg = op1->isUsedFromReg() ? op1->GetRegNum() : REG_NA;
regNumber op2reg = op2->isUsedFromReg() ? op2->GetRegNum() : REG_NA;
GenTree* dst;
GenTree* src;
// This is the case of reg1 = reg1 op reg2
// We're ready to emit the instruction without any moves
if (op1reg == targetReg)
{
dst = op1;
src = op2;
}
// We have reg1 = reg2 op reg1
// In order for this operation to be correct
// we need that op is a commutative operation so
// we can convert it into reg1 = reg1 op reg2 and emit
// the same code as above
// NOTE(review): no commutativity check is visible in this excerpt before the operands are swapped.
else if (op2reg == targetReg)
{
dst = op2;
src = op1;
}
// dest, op1 and op2 registers are different:
// reg3 = reg1 op reg2
// We can implement this by issuing a mov:
// reg3 = reg1
// reg3 = reg3 op reg2
else
{
var_types op1Type = op1->TypeGet();
inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false);
regSet.verifyRegUsed(targetReg);
gcInfo.gcMarkRegPtrVal(targetReg, op1Type);
// After the mov the node itself (living in targetReg) is the destination operand.
dst = treeNode;
src = op2;
}
// try to use an inc or dec
// Only for a non-floating add whose source is a contained integer immediate and that
// does not need an overflow check; +1 becomes inc, -1 becomes dec, anything else falls through.
if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
{
if (src->IsIntegralConst(1))
{
emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg);
genProduceReg(treeNode);
return;
}
else if (src->IsIntegralConst(-1))
{
emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg);
genProduceReg(treeNode);
return;
}
}
// Generic path: emit `dst = dst ins src`.
// NOTE(review): `r` is never read afterwards and, unlike the inc/dec paths, genProduceReg is
// not called here in this excerpt — the tail of the function looks trimmed; confirm.
regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
}
// Эмиттер
/*****************************************************************************
 *
 *  Add an instruction with two register operands.
 *
 *  ins         - the instruction to add
 *  attr        - emit attribute; EA_SIZE(attr) gives the operand size, which must not
 *                exceed EA_64BYTE
 *  reg1, reg2  - the two register operands, stored as idReg1 / idReg2 of the descriptor
 *  instOptions - instruction options; when any EVEX.b bit is set it is recorded on the
 *                descriptor (EVEX encoding must be in use)
 *
 *  Move instructions are not supposed to come through here: they are forwarded to
 *  emitIns_Mov() and nothing else is recorded for them.
 */
void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, insOpts instOptions)
{
    if (IsMovInstruction(ins))
    {
        assert(!"Please use emitIns_Mov() to correctly handle move elision");
        emitIns_Mov(ins, attr, reg1, reg2, /* canSkip */ false);
        // The move has been handed to emitIns_Mov. Without this return, a build with
        // asserts compiled out would fall through and build a second descriptor for
        // the very same instruction.
        return;
    }

    emitAttr size = EA_SIZE(attr);

    assert(size <= EA_64BYTE);
    noway_assert(emitVerifyEncodable(ins, size, reg1, reg2));

    /* Special case: "XCHG" uses a different format */
    insFormat fmt = (ins == INS_xchg) ? IF_RRW_RRW : emitInsModeFormat(ins, IF_RRD_RRD);

    instrDesc* id = emitNewInstrSmall(attr);
    id->idIns(ins);
    id->idInsFmt(fmt);
    id->idReg1(reg1);
    id->idReg2(reg2);

    if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE)
    {
        // if EVEX.b needs to be set in this path, then it should be embedded rounding.
        assert(UseEvexEncoding());
        id->idSetEvexbContext(instOptions);
    }

    // Compute the encoded size, store it on the descriptor and account for it in the
    // running size of the current instruction group.
    UNATIVE_OFFSET sz = emitInsSizeRR(id);
    id->idCodeSize(sz);

    dispIns(id);
    emitCurIGsize += sz;
}
Почему программы с одних адресов начинаются?
Вот есть Tlb, представим это как линейный массив, или c , если все адреса будут одинаковы. То Очевидно что tlb будет работать в 1% своего множества
// Initialize ILGPU.
// Context and Accelerator are disposable; `using` declarations release them when the
// enclosing scope ends, after the device buffers declared below them (reverse declaration order).
using Context context = Context.CreateDefault();
// Uses the OpenCL device at index 1.
// Alternative device selection: context.GetPreferredDevice(preferCPU: false).CreateAccelerator(context);
using Accelerator accelerator = context.CreateCLAccelerator(1);
// Load the data.
using MemoryBuffer1D<float, Stride1D.Dense> deviceData = accelerator.Allocate1D(input);
using MemoryBuffer1D<float, Stride1D.Dense> deviceOutput = accelerator.Allocate1D<float>(output);
// load / precompile the kernel
Action<Index1D, ArrayView<float>, ArrayView<float>> loadedKernel =
    accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<float>, ArrayView<float>>(Kernel);
// finish compiling and tell the accelerator to start computing the kernel
// (one kernel index per element of the output buffer)
loadedKernel((int)deviceOutput.Length, deviceData.View, deviceOutput.View);
// Synchronize with the accelerator before the results are used.
accelerator.Synchronize();
Setup
метод. Раз уж ты пользуешься BenchmarkDotNet, то вот помощь с этим:
public class SampleBenchmark
{
/// <summary>
/// Kernel body executed once per index: stores into <paramref name="output"/> at
/// <paramref name="i"/> the element of <paramref name="data"/> located at
/// <c>i % data.Length</c>.
/// </summary>
/// <param name="i">Index of the element this invocation handles.</param>
/// <param name="data">View that values are read from.</param>
/// <param name="output">View that values are written to.</param>
static void Kernel(Index1D i, ArrayView<float> data, ArrayView<float> output)
{
    // Wrap the read position so it always falls inside `data`.
    var wrappedIndex = i % data.Length;
    output[i] = data[wrappedIndex];
}
// Argument source referenced by [ArgumentsSource(nameof(Arguments))]: a single argument set
// made of two float arrays with 1,000,000 elements each. Every read of the property
// allocates fresh arrays.
public static IEnumerable<object[]> Arguments => new[] {new object[]{new float[1000000], new float[1000000]} };
// Host-side buffers of 1,000,000 floats; Setup fills _inputBuffer with random values and
// uses both arrays to allocate the device buffers.
private float[] _outputBuffer = new float[1000000];
private float[] _inputBuffer = new float[1000000];
// ILGPU objects: assigned in Setup, disposed in TearDown, null until Setup has run.
private Context? _context;
private Accelerator? _accelerator;
// Kernel delegate obtained from the accelerator in Setup.
private Action<Index1D, ArrayView<float>, ArrayView<float>>? _loadedKernel;
// Device buffers allocated from _inputBuffer and _outputBuffer in Setup.
private MemoryBuffer1D<float, Stride1D.Dense>? _deviceData;
private MemoryBuffer1D<float, Stride1D.Dense>? _deviceOutput;
/// <summary>
/// One-time benchmark setup: fills the input buffer with random values, creates the ILGPU
/// context and accelerator, loads the kernel and allocates both device buffers.
/// </summary>
[GlobalSetup]
public void Setup()
{
    // Fill the host input buffer, front to back, with random singles.
    Random rng = new Random();
    int length = _inputBuffer.Length;
    for (int slot = 0; slot != length; ++slot)
    {
        _inputBuffer[slot] = rng.NextSingle();
    }

    // Each field is assigned as soon as its object exists, in creation order.
    Context context = Context.CreateDefault();
    _context = context;

    Accelerator accelerator = context.GetPreferredDevice(preferCPU: false).CreateAccelerator(context);
    _accelerator = accelerator;

    _loadedKernel = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<float>, ArrayView<float>>(Kernel);
    _deviceData = accelerator.Allocate1D(_inputBuffer);
    _deviceOutput = accelerator.Allocate1D(_outputBuffer);
}
/// <summary>
/// One-time benchmark cleanup: releases the ILGPU objects created in Setup.
/// </summary>
/// <remarks>
/// Objects are disposed in the reverse of their creation order — device buffers first,
/// then the accelerator they were allocated from, and the context last — so that no
/// object outlives the parent it was created from. Fields that were never assigned
/// are skipped.
/// </remarks>
[GlobalCleanup]
public void TearDown()
{
    _deviceOutput?.Dispose();
    _deviceData?.Dispose();
    _accelerator?.Dispose();
    _context?.Dispose();
}
/// <summary>
/// GPU benchmark: launches the preloaded kernel over the device buffers and waits on the
/// accelerator. The <paramref name="input"/> and <paramref name="output"/> arguments are
/// not read here; the kernel works on the buffers allocated in Setup.
/// </summary>
[ArgumentsSource(nameof(Arguments))]
[Benchmark]
public void GPUTest(float[] input, float[] output)
{
    var kernel = _loadedKernel!;
    var source = _deviceData;
    var destination = _deviceOutput;

    // One kernel index per element of the output buffer.
    kernel((int)destination.Length, source.View, destination.View);
    _accelerator!.Synchronize();
}
// CPU benchmark: copies input into output one element at a time.
// The loop itself is the measured workload, so its shape is intentionally left as written.
// input  - source array; its length drives the loop.
// output - destination array; must be at least as long as input, otherwise indexing throws.
[Benchmark]
[ArgumentsSource(nameof(Arguments))]
public void CpuTest(float[] input, float[] output)
{
for (var i = 0; i < input.Length; i++)
{
output[i] = input[i];
}
}
}
| Method | input | output | Mean | Error | StdDev |
|-------- |---------------- |---------------- |----------:|---------:|---------:|
| GPUTest | Single[1000000] | Single[1000000] | 61.18 us | 0.101 us | 0.095 us |
| CpuTest | Single[1000000] | Single[1000000] | 243.54 us | 3.114 us | 2.600 us |
// Dispatch table: maps a runtime type to the function that adds two boxed values of that type.
var typeToFunc = new Dictionary<Type, AddFunc>
{
    [typeof(int)] = IntAdd,
    [typeof(Vector2)] = Vector2Add,
};

// Integer operands go through the generic Add and print the sum.
int left = 123;
int right = 14455;
int result = Add(left, right);
Console.WriteLine($"Результат сложения {left} и {right} = {result}");

// Same call shape with Vector2 operands.
Vector2 leftVector = new(123, 55);
Vector2 rightVector = new(55, 111);
Vector2 resultVector = Add(leftVector, rightVector);
Console.WriteLine($"Результат сложения {leftVector} и {rightVector} = {resultVector}");
// Adds two values of type T by looking up the add function registered for typeof(T) in
// typeToFunc, calling it with both operands and casting the result back to T.
// The dictionary lookup throws when no function is registered for T.
T Add<T>(T left, T right)
{
    AddFunc adder = typeToFunc[typeof(T)];
    object sum = adder(left, right);
    return (T)sum;
}
// Casts both operands to Vector2 (left first) and returns their sum as an object.
// Throws InvalidCastException when an operand is not a Vector2.
object Vector2Add(object left, object right)
{
    var first = (Vector2)left;
    var second = (Vector2)right;
    return first + second;
}
// Unboxes both operands as int (left first) and returns their sum as an object.
// Throws InvalidCastException when an operand is not a boxed int.
object IntAdd(object left, object right)
{
    var first = (int)left;
    var second = (int)right;
    return first + second;
}
// Signature shared by the type-specific add functions (IntAdd, Vector2Add): takes the two
// operands as objects and returns the sum as an object.
delegate object AddFunc(object left, object right);
// Start from a stored number and add 23 to it. The vector callback must never run for a
// number-holding value, so it throws if it is ever invoked.
NumberOrVector2<int> number = NumberOrVector2<int>.FromNumber(123);
NumberOrVector2<int> newNumber = number.Add(
    numberAdd: () => 23,
    vectorAdd: () => throw new InvalidOperationException("хранится число"));

// Report the outcome: the failure branch is checked first, the success branch prints the sum.
if (!newNumber.TryGetNumber(out var result))
{
    Console.WriteLine($"Ошибка - хранился вектор");
}
else
{
    Console.WriteLine($"Получился результат: {result}");
}
/// <summary>
/// Immutable value that holds either a number of type <typeparamref name="TNumber"/> or a
/// <see cref="Vector2"/>. A value holds a vector exactly when its vector slot is set;
/// otherwise it holds a number (this includes the default value of the struct).
/// </summary>
/// <typeparam name="TNumber">Unmanaged numeric type implementing <see cref="INumber{TSelf}"/>.</typeparam>
public readonly struct NumberOrVector2<TNumber> where TNumber: unmanaged, INumber<TNumber>
{
    private readonly TNumber _number;
    private readonly Vector2? _vector;

    private NumberOrVector2(TNumber number, Vector2? vector)
    {
        _number = number;
        _vector = vector;
    }

    /// <summary>Creates a value that holds <paramref name="number"/>.</summary>
    public static NumberOrVector2<TNumber> FromNumber(TNumber number) => new(number, null);

    /// <summary>Creates a value that holds <paramref name="vector"/>; the number slot is left at its default.</summary>
    public static NumberOrVector2<TNumber> FromVector(Vector2 vector) => new(default, vector);

    /// <summary>
    /// Copies the number slot into <paramref name="number"/> (even when a vector is held)
    /// and returns <c>true</c> when no vector is stored.
    /// </summary>
    public bool TryGetNumber(out TNumber number)
    {
        number = _number;
        return _vector is null;
    }

    /// <summary>
    /// Copies the stored vector, or the default vector when none is stored, into
    /// <paramref name="vector"/> and returns <c>true</c> when a vector is stored.
    /// </summary>
    public bool TryGetVector(out Vector2 vector)
    {
        vector = _vector ?? default;
        return _vector is not null;
    }

    /// <summary>
    /// Returns a new value with an addend applied to whichever alternative is held.
    /// Only the callback matching the held alternative is invoked.
    /// </summary>
    /// <param name="numberAdd">Produces the addend when a number is held.</param>
    /// <param name="vectorAdd">Produces the addend when a vector is held.</param>
    public NumberOrVector2<TNumber> Add(Func<TNumber> numberAdd, Func<Vector2> vectorAdd)
    {
        return _vector.HasValue
            ? new NumberOrVector2<TNumber>(_number, _vector.Value + vectorAdd())
            : new NumberOrVector2<TNumber>(_number + numberAdd(), null);
    }
}