mirror of
https://src.fedoraproject.org/rpms/mesa.git
synced 2024-11-24 09:32:42 +00:00
1583 lines
65 KiB
Diff
1583 lines
65 KiB
Diff
diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp
|
|
index cb646e2dd30..eac0a244adf 100644
|
|
--- a/src/amd/compiler/tests/main.cpp
|
|
+++ b/src/amd/compiler/tests/main.cpp
|
|
@@ -34,6 +34,8 @@
|
|
#include "aco_ir.h"
|
|
#include "framework.h"
|
|
|
|
+#include "util/u_cpu_detect.h"
|
|
+
|
|
static const char *help_message =
|
|
"Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n"
|
|
"\n"
|
|
@@ -227,6 +229,8 @@ int main(int argc, char **argv)
|
|
return 99;
|
|
}
|
|
|
|
+ util_cpu_detect();
|
|
+
|
|
if (do_list) {
|
|
for (auto test : tests)
|
|
printf("%s\n", test.first.c_str());
|
|
diff --git a/src/compiler/glsl/standalone.cpp b/src/compiler/glsl/standalone.cpp
|
|
index ca187001186..2714d8b95ed 100644
|
|
--- a/src/compiler/glsl/standalone.cpp
|
|
+++ b/src/compiler/glsl/standalone.cpp
|
|
@@ -401,6 +401,8 @@ standalone_compile_shader(const struct standalone_options *_options,
|
|
int status = EXIT_SUCCESS;
|
|
bool glsl_es = false;
|
|
|
|
+ util_cpu_detect();
|
|
+
|
|
options = _options;
|
|
|
|
switch (options->glsl_version) {
|
|
diff --git a/src/compiler/nir/tests/negative_equal_tests.cpp b/src/compiler/nir/tests/negative_equal_tests.cpp
|
|
index f83041a4fbf..76472e48309 100644
|
|
--- a/src/compiler/nir/tests/negative_equal_tests.cpp
|
|
+++ b/src/compiler/nir/tests/negative_equal_tests.cpp
|
|
@@ -36,6 +36,7 @@ protected:
|
|
const_value_negative_equal_test()
|
|
{
|
|
glsl_type_singleton_init_or_ref();
|
|
+ util_cpu_detect();
|
|
|
|
memset(c1, 0, sizeof(c1));
|
|
memset(c2, 0, sizeof(c2));
|
|
@@ -55,6 +56,7 @@ protected:
|
|
alu_srcs_negative_equal_test()
|
|
{
|
|
glsl_type_singleton_init_or_ref();
|
|
+ util_cpu_detect();
|
|
|
|
static const nir_shader_compiler_options options = { };
|
|
nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options);
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
|
|
index 165d73d94fc..33269e528fe 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
|
|
@@ -104,13 +104,13 @@ lp_build_min_simple(struct lp_build_context *bld,
|
|
|
|
/* TODO: optimize the constant case */
|
|
|
|
- if (type.floating && util_cpu_caps.has_sse) {
|
|
+ if (type.floating && util_get_cpu_caps()->has_sse) {
|
|
if (type.width == 32) {
|
|
if (type.length == 1) {
|
|
intrinsic = "llvm.x86.sse.min.ss";
|
|
intr_size = 128;
|
|
}
|
|
- else if (type.length <= 4 || !util_cpu_caps.has_avx) {
|
|
+ else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
|
|
intrinsic = "llvm.x86.sse.min.ps";
|
|
intr_size = 128;
|
|
}
|
|
@@ -119,12 +119,12 @@ lp_build_min_simple(struct lp_build_context *bld,
|
|
intr_size = 256;
|
|
}
|
|
}
|
|
- if (type.width == 64 && util_cpu_caps.has_sse2) {
|
|
+ if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
|
|
if (type.length == 1) {
|
|
intrinsic = "llvm.x86.sse2.min.sd";
|
|
intr_size = 128;
|
|
}
|
|
- else if (type.length == 2 || !util_cpu_caps.has_avx) {
|
|
+ else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
|
|
intrinsic = "llvm.x86.sse2.min.pd";
|
|
intr_size = 128;
|
|
}
|
|
@@ -134,7 +134,7 @@ lp_build_min_simple(struct lp_build_context *bld,
|
|
}
|
|
}
|
|
}
|
|
- else if (type.floating && util_cpu_caps.has_altivec) {
|
|
+ else if (type.floating && util_get_cpu_caps()->has_altivec) {
|
|
if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
|
|
nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
|
|
debug_printf("%s: altivec doesn't support nan return nan behavior\n",
|
|
@@ -144,7 +144,7 @@ lp_build_min_simple(struct lp_build_context *bld,
|
|
intrinsic = "llvm.ppc.altivec.vminfp";
|
|
intr_size = 128;
|
|
}
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
intr_size = 128;
|
|
if (type.width == 8) {
|
|
if (!type.sign) {
|
|
@@ -174,7 +174,7 @@ lp_build_min_simple(struct lp_build_context *bld,
|
|
* The sse intrinsics return the second operator in case of nan by
|
|
* default so we need to special code to handle those.
|
|
*/
|
|
- if (util_cpu_caps.has_sse && type.floating &&
|
|
+ if (util_get_cpu_caps()->has_sse && type.floating &&
|
|
nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
|
|
nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
|
|
nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
|
|
@@ -274,13 +274,13 @@ lp_build_max_simple(struct lp_build_context *bld,
|
|
|
|
/* TODO: optimize the constant case */
|
|
|
|
- if (type.floating && util_cpu_caps.has_sse) {
|
|
+ if (type.floating && util_get_cpu_caps()->has_sse) {
|
|
if (type.width == 32) {
|
|
if (type.length == 1) {
|
|
intrinsic = "llvm.x86.sse.max.ss";
|
|
intr_size = 128;
|
|
}
|
|
- else if (type.length <= 4 || !util_cpu_caps.has_avx) {
|
|
+ else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
|
|
intrinsic = "llvm.x86.sse.max.ps";
|
|
intr_size = 128;
|
|
}
|
|
@@ -289,12 +289,12 @@ lp_build_max_simple(struct lp_build_context *bld,
|
|
intr_size = 256;
|
|
}
|
|
}
|
|
- if (type.width == 64 && util_cpu_caps.has_sse2) {
|
|
+ if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
|
|
if (type.length == 1) {
|
|
intrinsic = "llvm.x86.sse2.max.sd";
|
|
intr_size = 128;
|
|
}
|
|
- else if (type.length == 2 || !util_cpu_caps.has_avx) {
|
|
+ else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
|
|
intrinsic = "llvm.x86.sse2.max.pd";
|
|
intr_size = 128;
|
|
}
|
|
@@ -304,7 +304,7 @@ lp_build_max_simple(struct lp_build_context *bld,
|
|
}
|
|
}
|
|
}
|
|
- else if (type.floating && util_cpu_caps.has_altivec) {
|
|
+ else if (type.floating && util_get_cpu_caps()->has_altivec) {
|
|
if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
|
|
nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
|
|
debug_printf("%s: altivec doesn't support nan return nan behavior\n",
|
|
@@ -314,7 +314,7 @@ lp_build_max_simple(struct lp_build_context *bld,
|
|
intrinsic = "llvm.ppc.altivec.vmaxfp";
|
|
intr_size = 128;
|
|
}
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
intr_size = 128;
|
|
if (type.width == 8) {
|
|
if (!type.sign) {
|
|
@@ -338,7 +338,7 @@ lp_build_max_simple(struct lp_build_context *bld,
|
|
}
|
|
|
|
if (intrinsic) {
|
|
- if (util_cpu_caps.has_sse && type.floating &&
|
|
+ if (util_get_cpu_caps()->has_sse && type.floating &&
|
|
nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
|
|
nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
|
|
nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
|
|
@@ -472,12 +472,12 @@ lp_build_add(struct lp_build_context *bld,
|
|
return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
|
|
}
|
|
if (type.width * type.length == 128) {
|
|
- if (util_cpu_caps.has_sse2) {
|
|
+ if (util_get_cpu_caps()->has_sse2) {
|
|
if (type.width == 8)
|
|
intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
|
|
if (type.width == 16)
|
|
intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
if (type.width == 8)
|
|
intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
|
|
if (type.width == 16)
|
|
@@ -485,7 +485,7 @@ lp_build_add(struct lp_build_context *bld,
|
|
}
|
|
}
|
|
if (type.width * type.length == 256) {
|
|
- if (util_cpu_caps.has_avx2) {
|
|
+ if (util_get_cpu_caps()->has_avx2) {
|
|
if (type.width == 8)
|
|
intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
|
|
if (type.width == 16)
|
|
@@ -713,11 +713,11 @@ lp_build_hadd_partial4(struct lp_build_context *bld,
|
|
tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
|
|
tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
|
|
|
|
- if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
|
|
+ if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
|
|
bld->type.length == 4) {
|
|
intrinsic = "llvm.x86.sse3.hadd.ps";
|
|
}
|
|
- else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
|
|
+ else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
|
|
bld->type.length == 8) {
|
|
intrinsic = "llvm.x86.avx.hadd.ps.256";
|
|
}
|
|
@@ -796,12 +796,12 @@ lp_build_sub(struct lp_build_context *bld,
|
|
return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
|
|
}
|
|
if (type.width * type.length == 128) {
|
|
- if (util_cpu_caps.has_sse2) {
|
|
+ if (util_get_cpu_caps()->has_sse2) {
|
|
if (type.width == 8)
|
|
intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
|
|
if (type.width == 16)
|
|
intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
if (type.width == 8)
|
|
intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
|
|
if (type.width == 16)
|
|
@@ -809,7 +809,7 @@ lp_build_sub(struct lp_build_context *bld,
|
|
}
|
|
}
|
|
if (type.width * type.length == 256) {
|
|
- if (util_cpu_caps.has_avx2) {
|
|
+ if (util_get_cpu_caps()->has_avx2) {
|
|
if (type.width == 8)
|
|
intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
|
|
if (type.width == 16)
|
|
@@ -1078,8 +1078,8 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
|
|
*/
|
|
if (LLVM_VERSION_MAJOR < 7 &&
|
|
(bld->type.length == 4 || bld->type.length == 8) &&
|
|
- ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
|
|
- util_cpu_caps.has_sse4_1)) {
|
|
+ ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
|
|
+ util_get_cpu_caps()->has_sse4_1)) {
|
|
const char *intrinsic = NULL;
|
|
LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
|
|
LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
|
|
@@ -1096,7 +1096,7 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
|
|
aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
|
|
bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
|
|
|
|
- if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
|
|
+ if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
|
|
if (bld->type.sign) {
|
|
intrinsic = "llvm.x86.avx2.pmul.dq";
|
|
} else {
|
|
@@ -1331,8 +1331,8 @@ lp_build_div(struct lp_build_context *bld,
|
|
|
|
/* fast rcp is disabled (just uses div), so makes no sense to try that */
|
|
if(FALSE &&
|
|
- ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
|
|
- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
|
|
+ ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
|
|
+ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
|
|
type.floating)
|
|
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
|
|
|
|
@@ -1745,7 +1745,7 @@ lp_build_abs(struct lp_build_context *bld,
|
|
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
|
|
}
|
|
|
|
- if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
|
|
+ if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
|
|
switch(type.width) {
|
|
case 8:
|
|
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
|
|
@@ -1755,7 +1755,7 @@ lp_build_abs(struct lp_build_context *bld,
|
|
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
|
|
}
|
|
}
|
|
- else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
|
|
+ else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
|
|
switch(type.width) {
|
|
case 8:
|
|
return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
|
|
@@ -1897,15 +1897,15 @@ lp_build_int_to_float(struct lp_build_context *bld,
|
|
static boolean
|
|
arch_rounding_available(const struct lp_type type)
|
|
{
|
|
- if ((util_cpu_caps.has_sse4_1 &&
|
|
+ if ((util_get_cpu_caps()->has_sse4_1 &&
|
|
(type.length == 1 || type.width*type.length == 128)) ||
|
|
- (util_cpu_caps.has_avx && type.width*type.length == 256) ||
|
|
- (util_cpu_caps.has_avx512f && type.width*type.length == 512))
|
|
+ (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
|
|
+ (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
|
|
return TRUE;
|
|
- else if ((util_cpu_caps.has_altivec &&
|
|
+ else if ((util_get_cpu_caps()->has_altivec &&
|
|
(type.width == 32 && type.length == 4)))
|
|
return TRUE;
|
|
- else if (util_cpu_caps.has_neon)
|
|
+ else if (util_get_cpu_caps()->has_neon)
|
|
return TRUE;
|
|
|
|
return FALSE;
|
|
@@ -1935,7 +1935,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
|
|
assert(type.width == 32);
|
|
|
|
assert(lp_check_value(type, a));
|
|
- assert(util_cpu_caps.has_sse2);
|
|
+ assert(util_get_cpu_caps()->has_sse2);
|
|
|
|
/* This is relying on MXCSR rounding mode, which should always be nearest. */
|
|
if (type.length == 1) {
|
|
@@ -1961,7 +1961,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
|
|
}
|
|
else {
|
|
assert(type.width*type.length == 256);
|
|
- assert(util_cpu_caps.has_avx);
|
|
+ assert(util_get_cpu_caps()->has_avx);
|
|
|
|
intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
|
|
}
|
|
@@ -1987,7 +1987,7 @@ lp_build_round_altivec(struct lp_build_context *bld,
|
|
assert(type.floating);
|
|
|
|
assert(lp_check_value(type, a));
|
|
- assert(util_cpu_caps.has_altivec);
|
|
+ assert(util_get_cpu_caps()->has_altivec);
|
|
|
|
(void)type;
|
|
|
|
@@ -2014,7 +2014,7 @@ lp_build_round_arch(struct lp_build_context *bld,
|
|
LLVMValueRef a,
|
|
enum lp_build_round_mode mode)
|
|
{
|
|
- if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
|
|
+ if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
|
|
LLVMBuilderRef builder = bld->gallivm->builder;
|
|
const struct lp_type type = bld->type;
|
|
const char *intrinsic_root;
|
|
@@ -2042,7 +2042,7 @@ lp_build_round_arch(struct lp_build_context *bld,
|
|
lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
|
|
return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
|
|
}
|
|
- else /* (util_cpu_caps.has_altivec) */
|
|
+ else /* (util_get_cpu_caps()->has_altivec) */
|
|
return lp_build_round_altivec(bld, a, mode);
|
|
}
|
|
|
|
@@ -2377,9 +2377,9 @@ lp_build_iround(struct lp_build_context *bld,
|
|
|
|
assert(lp_check_value(type, a));
|
|
|
|
- if ((util_cpu_caps.has_sse2 &&
|
|
+ if ((util_get_cpu_caps()->has_sse2 &&
|
|
((type.width == 32) && (type.length == 1 || type.length == 4))) ||
|
|
- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
|
|
+ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
|
|
return lp_build_iround_nearest_sse2(bld, a);
|
|
}
|
|
if (arch_rounding_available(type)) {
|
|
@@ -2664,8 +2664,8 @@ lp_build_rcp(struct lp_build_context *bld,
|
|
* particular uses that require less workarounds.
|
|
*/
|
|
|
|
- if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
|
|
- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
|
|
+ if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
|
|
+ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
|
|
const unsigned num_iterations = 0;
|
|
LLVMValueRef res;
|
|
unsigned i;
|
|
@@ -2784,8 +2784,8 @@ lp_build_fast_rsqrt_available(struct lp_type type)
|
|
{
|
|
assert(type.floating);
|
|
|
|
- if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
|
|
- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
|
|
+ if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
|
|
+ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
|
|
return true;
|
|
}
|
|
return false;
|
|
@@ -3694,7 +3694,7 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
|
|
LLVMValueRef
|
|
lp_build_fpstate_get(struct gallivm_state *gallivm)
|
|
{
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
LLVMBuilderRef builder = gallivm->builder;
|
|
LLVMValueRef mxcsr_ptr = lp_build_alloca(
|
|
gallivm,
|
|
@@ -3715,7 +3715,7 @@ void
|
|
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
|
|
boolean zero)
|
|
{
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
/* turn on DAZ (64) | FTZ (32768) = 32832 if available */
|
|
int daz_ftz = _MM_FLUSH_ZERO_MASK;
|
|
|
|
@@ -3724,7 +3724,7 @@ lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
|
|
LLVMValueRef mxcsr =
|
|
LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
|
|
|
|
- if (util_cpu_caps.has_daz) {
|
|
+ if (util_get_cpu_caps()->has_daz) {
|
|
/* Enable denormals are zero mode */
|
|
daz_ftz |= _MM_DENORMALS_ZERO_MASK;
|
|
}
|
|
@@ -3745,7 +3745,7 @@ void
|
|
lp_build_fpstate_set(struct gallivm_state *gallivm,
|
|
LLVMValueRef mxcsr_ptr)
|
|
{
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
LLVMBuilderRef builder = gallivm->builder;
|
|
mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
|
|
LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
|
|
index c68b8850473..af445b00c1a 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
|
|
@@ -101,7 +101,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
|
|
LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
|
|
LLVMValueRef h;
|
|
|
|
- if (util_cpu_caps.has_f16c &&
|
|
+ if (util_get_cpu_caps()->has_f16c &&
|
|
(src_length == 4 || src_length == 8)) {
|
|
if (LLVM_VERSION_MAJOR < 11) {
|
|
const char *intrinsic = NULL;
|
|
@@ -167,7 +167,7 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
|
|
* useless.
|
|
*/
|
|
|
|
- if (util_cpu_caps.has_f16c &&
|
|
+ if (util_get_cpu_caps()->has_f16c &&
|
|
(length == 4 || length == 8)) {
|
|
struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
|
|
unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
|
|
@@ -489,7 +489,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
|
|
|
|
/* Special case 4x4x32 --> 1x16x8 */
|
|
if (src_type.length == 4 &&
|
|
- (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
|
|
+ (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
|
|
{
|
|
num_dsts = (num_srcs + 3) / 4;
|
|
dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
|
|
@@ -500,7 +500,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
|
|
|
|
/* Special case 2x8x32 --> 1x16x8 */
|
|
if (src_type.length == 8 &&
|
|
- util_cpu_caps.has_avx)
|
|
+ util_get_cpu_caps()->has_avx)
|
|
{
|
|
num_dsts = (num_srcs + 1) / 2;
|
|
dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
|
|
@@ -597,7 +597,7 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|
((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
|
|
(num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
|
|
|
|
- (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
|
|
+ (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
|
|
{
|
|
struct lp_build_context bld;
|
|
struct lp_type int16_type, int32_type;
|
|
@@ -710,7 +710,7 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|
((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
|
|
(num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
|
|
|
|
- util_cpu_caps.has_avx) {
|
|
+ util_get_cpu_caps()->has_avx) {
|
|
|
|
struct lp_build_context bld;
|
|
struct lp_type int16_type, int32_type;
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
|
|
index 174857e06d9..e17c7881e7d 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
|
|
@@ -642,8 +642,8 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
|
|
* XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
|
|
* Much cheaper (but we don't care that much if n == 1).
|
|
*/
|
|
- if ((util_cpu_caps.has_sse2 && n == 4) ||
|
|
- (util_cpu_caps.has_avx2 && n == 8)) {
|
|
+ if ((util_get_cpu_caps()->has_sse2 && n == 4) ||
|
|
+ (util_get_cpu_caps()->has_avx2 && n == 8)) {
|
|
color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
|
|
color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
|
|
}
|
|
@@ -1350,7 +1350,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
|
|
if (is_dxt1_variant) {
|
|
LLVMValueRef color23_2, color2_2;
|
|
|
|
- if (util_cpu_caps.has_sse2) {
|
|
+ if (util_get_cpu_caps()->has_sse2) {
|
|
LLVMValueRef intrargs[2];
|
|
intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
|
|
/* same interleave as for lerp23 - correct result in 2nd element */
|
|
@@ -1389,7 +1389,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
|
|
color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
|
|
}
|
|
|
|
- if (util_cpu_caps.has_ssse3) {
|
|
+ if (util_get_cpu_caps()->has_ssse3) {
|
|
/*
|
|
* Use pshufb as mini-lut. (Only doable with intrinsics as the
|
|
* final shuffles are non-constant. pshufb is awesome!)
|
|
@@ -1689,7 +1689,7 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
|
|
type16.sign = FALSE;
|
|
sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
|
|
|
|
- if (!util_cpu_caps.has_ssse3) {
|
|
+ if (!util_get_cpu_caps()->has_ssse3) {
|
|
LLVMValueRef acodeg, mask1, acode0, acode1;
|
|
|
|
/* extraction of the 3 bit values into something more useful is HARD */
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
|
|
index 121452d7596..97deffe1de0 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
|
|
@@ -90,7 +90,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
|
|
* per element. Didn't measure performance but cuts shader size
|
|
* by quite a bit (less difference if cpu has no sse4.1 support).
|
|
*/
|
|
- if (util_cpu_caps.has_sse2 && n > 1) {
|
|
+ if (util_get_cpu_caps()->has_sse2 && n > 1) {
|
|
LLVMValueRef sel, tmp, tmp2;
|
|
struct lp_build_context bld32;
|
|
|
|
@@ -174,7 +174,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
|
|
* per element. Didn't measure performance but cuts shader size
|
|
* by quite a bit (less difference if cpu has no sse4.1 support).
|
|
*/
|
|
- if (util_cpu_caps.has_sse2 && n > 1) {
|
|
+ if (util_get_cpu_caps()->has_sse2 && n > 1) {
|
|
LLVMValueRef sel, tmp;
|
|
struct lp_build_context bld32;
|
|
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
|
|
index e991b0dc375..42cc17371a0 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
|
|
@@ -488,7 +488,7 @@ lp_build_gather(struct gallivm_state *gallivm,
|
|
* 32bit/64bit fetches you're doing it wrong (this is gather, not
|
|
* conversion) and it would be awkward for floats.
|
|
*/
|
|
- } else if (util_cpu_caps.has_avx2 && !need_expansion &&
|
|
+ } else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&
|
|
src_width == 32 && (length == 4 || length == 8)) {
|
|
return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
|
|
base_ptr, offsets);
|
|
@@ -500,7 +500,7 @@ lp_build_gather(struct gallivm_state *gallivm,
|
|
* (In general, should be more of a win if the fetch is 256bit wide -
|
|
* this is true for the 32bit case above too.)
|
|
*/
|
|
- } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
|
|
+ } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&
|
|
src_width == 64 && (length == 2 || length == 4)) {
|
|
return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
|
|
base_ptr, offsets);
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
|
|
index 685ed0e58aa..dd428242cb9 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
|
|
@@ -433,6 +433,7 @@ lp_build_init(void)
|
|
/* For simulating less capable machines */
|
|
#ifdef DEBUG
|
|
if (debug_get_bool_option("LP_FORCE_SSE2", FALSE)) {
|
|
+ extern struct util_cpu_caps_t util_cpu_caps;
|
|
assert(util_cpu_caps.has_sse2);
|
|
util_cpu_caps.has_sse3 = 0;
|
|
util_cpu_caps.has_ssse3 = 0;
|
|
@@ -445,7 +446,7 @@ lp_build_init(void)
|
|
}
|
|
#endif
|
|
|
|
- if (util_cpu_caps.has_avx2 || util_cpu_caps.has_avx) {
|
|
+ if (util_get_cpu_caps()->has_avx2 || util_get_cpu_caps()->has_avx) {
|
|
lp_native_vector_width = 256;
|
|
} else {
|
|
/* Leave it at 128, even when no SIMD extensions are available.
|
|
@@ -460,16 +461,16 @@ lp_build_init(void)
|
|
#if LLVM_VERSION_MAJOR < 4
|
|
if (lp_native_vector_width <= 128) {
|
|
/* Hide AVX support, as often LLVM AVX intrinsics are only guarded by
|
|
- * "util_cpu_caps.has_avx" predicate, and lack the
|
|
+ * "util_get_cpu_caps()->has_avx" predicate, and lack the
|
|
* "lp_native_vector_width > 128" predicate. And also to ensure a more
|
|
* consistent behavior, allowing one to test SSE2 on AVX machines.
|
|
* XXX: should not play games with util_cpu_caps directly as it might
|
|
* get used for other things outside llvm too.
|
|
*/
|
|
- util_cpu_caps.has_avx = 0;
|
|
- util_cpu_caps.has_avx2 = 0;
|
|
- util_cpu_caps.has_f16c = 0;
|
|
- util_cpu_caps.has_fma = 0;
|
|
+ util_get_cpu_caps()->has_avx = 0;
|
|
+ util_get_cpu_caps()->has_avx2 = 0;
|
|
+ util_get_cpu_caps()->has_f16c = 0;
|
|
+ util_get_cpu_caps()->has_fma = 0;
|
|
}
|
|
#endif
|
|
|
|
@@ -482,7 +483,7 @@ lp_build_init(void)
|
|
* Right now denorms get explicitly disabled (but elsewhere) for x86,
|
|
* whereas ppc64 explicitly enables them...
|
|
*/
|
|
- if (util_cpu_caps.has_altivec) {
|
|
+ if (util_get_cpu_caps()->has_altivec) {
|
|
unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
|
|
0xFFFF, 0xFFFF, 0xFFFE, 0xFFFF };
|
|
__asm (
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
|
|
index 315977ae745..3ed3b5a74b1 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
|
|
@@ -196,7 +196,7 @@ lp_build_compare(struct gallivm_state *gallivm,
|
|
|
|
if (!type.floating && !type.sign &&
|
|
type.width * type.length == 128 &&
|
|
- util_cpu_caps.has_sse2 &&
|
|
+ util_get_cpu_caps()->has_sse2 &&
|
|
(func == PIPE_FUNC_LESS ||
|
|
func == PIPE_FUNC_LEQUAL ||
|
|
func == PIPE_FUNC_GREATER ||
|
|
@@ -348,11 +348,11 @@ lp_build_select(struct lp_build_context *bld,
|
|
|
|
res = LLVMBuildSelect(builder, mask, a, b, "");
|
|
}
|
|
- else if (((util_cpu_caps.has_sse4_1 &&
|
|
+ else if (((util_get_cpu_caps()->has_sse4_1 &&
|
|
type.width * type.length == 128) ||
|
|
- (util_cpu_caps.has_avx &&
|
|
+ (util_get_cpu_caps()->has_avx &&
|
|
type.width * type.length == 256 && type.width >= 32) ||
|
|
- (util_cpu_caps.has_avx2 &&
|
|
+ (util_get_cpu_caps()->has_avx2 &&
|
|
type.width * type.length == 256)) &&
|
|
!LLVMIsConstant(a) &&
|
|
!LLVMIsConstant(b) &&
|
|
@@ -379,7 +379,7 @@ lp_build_select(struct lp_build_context *bld,
|
|
intrinsic = "llvm.x86.avx.blendv.ps.256";
|
|
arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
|
|
} else {
|
|
- assert(util_cpu_caps.has_avx2);
|
|
+ assert(util_get_cpu_caps()->has_avx2);
|
|
intrinsic = "llvm.x86.avx2.pblendvb";
|
|
arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
|
|
}
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
|
|
index 9b75676a4e2..4f3e696816c 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
|
|
@@ -400,22 +400,22 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
|
|
* http://llvm.org/PR19429
|
|
* http://llvm.org/PR16721
|
|
*/
|
|
- MAttrs.push_back(util_cpu_caps.has_sse ? "+sse" : "-sse" );
|
|
- MAttrs.push_back(util_cpu_caps.has_sse2 ? "+sse2" : "-sse2" );
|
|
- MAttrs.push_back(util_cpu_caps.has_sse3 ? "+sse3" : "-sse3" );
|
|
- MAttrs.push_back(util_cpu_caps.has_ssse3 ? "+ssse3" : "-ssse3" );
|
|
- MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1");
|
|
- MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_sse ? "+sse" : "-sse" );
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_sse2 ? "+sse2" : "-sse2" );
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_sse3 ? "+sse3" : "-sse3" );
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_ssse3 ? "+ssse3" : "-ssse3" );
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_sse4_1 ? "+sse4.1" : "-sse4.1");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_sse4_2 ? "+sse4.2" : "-sse4.2");
|
|
/*
|
|
* AVX feature is not automatically detected from CPUID by the X86 target
|
|
* yet, because the old (yet default) JIT engine is not capable of
|
|
* emitting the opcodes. On newer llvm versions it is and at least some
|
|
* versions (tested with 3.3) will emit avx opcodes without this anyway.
|
|
*/
|
|
- MAttrs.push_back(util_cpu_caps.has_avx ? "+avx" : "-avx");
|
|
- MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
|
|
- MAttrs.push_back(util_cpu_caps.has_fma ? "+fma" : "-fma");
|
|
- MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_avx ? "+avx" : "-avx");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_f16c ? "+f16c" : "-f16c");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_fma ? "+fma" : "-fma");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_avx2 ? "+avx2" : "-avx2");
|
|
/* disable avx512 and all subvariants */
|
|
MAttrs.push_back("-avx512cd");
|
|
MAttrs.push_back("-avx512er");
|
|
@@ -426,7 +426,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
|
|
MAttrs.push_back("-avx512vl");
|
|
#endif
|
|
#if defined(PIPE_ARCH_ARM)
|
|
- if (!util_cpu_caps.has_neon) {
|
|
+ if (!util_get_cpu_caps()->has_neon) {
|
|
MAttrs.push_back("-neon");
|
|
MAttrs.push_back("-crypto");
|
|
MAttrs.push_back("-vfp2");
|
|
@@ -434,7 +434,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
|
|
#endif
|
|
|
|
#if defined(PIPE_ARCH_PPC)
|
|
- MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_altivec ? "+altivec" : "-altivec");
|
|
#if (LLVM_VERSION_MAJOR < 4)
|
|
/*
|
|
* Make sure VSX instructions are disabled
|
|
@@ -444,7 +444,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
|
|
* https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0)
|
|
* https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0)
|
|
*/
|
|
- if (util_cpu_caps.has_altivec) {
|
|
+ if (util_get_cpu_caps()->has_altivec) {
|
|
MAttrs.push_back("-vsx");
|
|
}
|
|
#else
|
|
@@ -458,8 +458,8 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
|
|
* Make sure VSX instructions are ENABLED (if supported), unless
|
|
* VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0.
|
|
*/
|
|
- if (util_cpu_caps.has_altivec) {
|
|
- MAttrs.push_back(util_cpu_caps.has_vsx ? "+vsx" : "-vsx");
|
|
+ if (util_get_cpu_caps()->has_altivec) {
|
|
+ MAttrs.push_back(util_get_cpu_caps()->has_vsx ? "+vsx" : "-vsx");
|
|
}
|
|
#endif
|
|
#endif
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
|
|
index e1f652a9342..76e57c52f80 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
|
|
@@ -322,7 +322,7 @@ lp_build_interleave2(struct gallivm_state *gallivm,
|
|
{
|
|
LLVMValueRef shuffle;
|
|
|
|
- if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
|
|
+ if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) {
|
|
/*
|
|
* XXX: This is a workaround for llvm code generation deficiency. Strangely
|
|
* enough, while this needs vinsertf128/vextractf128 instructions (hence
|
|
@@ -484,7 +484,7 @@ lp_build_unpack2_native(struct gallivm_state *gallivm,
|
|
|
|
/* Interleave bits */
|
|
#if UTIL_ARCH_LITTLE_ENDIAN
|
|
- if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
|
|
+ if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) {
|
|
*dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
|
|
*dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
|
|
} else {
|
|
@@ -585,22 +585,22 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
assert(src_type.length * 2 == dst_type.length);
|
|
|
|
/* Check for special cases first */
|
|
- if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
|
|
+ if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) &&
|
|
src_type.width * src_type.length >= 128) {
|
|
const char *intrinsic = NULL;
|
|
boolean swap_intrinsic_operands = FALSE;
|
|
|
|
switch(src_type.width) {
|
|
case 32:
|
|
- if (util_cpu_caps.has_sse2) {
|
|
+ if (util_get_cpu_caps()->has_sse2) {
|
|
if (dst_type.sign) {
|
|
intrinsic = "llvm.x86.sse2.packssdw.128";
|
|
} else {
|
|
- if (util_cpu_caps.has_sse4_1) {
|
|
+ if (util_get_cpu_caps()->has_sse4_1) {
|
|
intrinsic = "llvm.x86.sse41.packusdw";
|
|
}
|
|
}
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
if (dst_type.sign) {
|
|
intrinsic = "llvm.ppc.altivec.vpkswss";
|
|
} else {
|
|
@@ -613,18 +613,18 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
break;
|
|
case 16:
|
|
if (dst_type.sign) {
|
|
- if (util_cpu_caps.has_sse2) {
|
|
+ if (util_get_cpu_caps()->has_sse2) {
|
|
intrinsic = "llvm.x86.sse2.packsswb.128";
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
intrinsic = "llvm.ppc.altivec.vpkshss";
|
|
#if UTIL_ARCH_LITTLE_ENDIAN
|
|
swap_intrinsic_operands = TRUE;
|
|
#endif
|
|
}
|
|
} else {
|
|
- if (util_cpu_caps.has_sse2) {
|
|
+ if (util_get_cpu_caps()->has_sse2) {
|
|
intrinsic = "llvm.x86.sse2.packuswb.128";
|
|
- } else if (util_cpu_caps.has_altivec) {
|
|
+ } else if (util_get_cpu_caps()->has_altivec) {
|
|
intrinsic = "llvm.ppc.altivec.vpkshus";
|
|
#if UTIL_ARCH_LITTLE_ENDIAN
|
|
swap_intrinsic_operands = TRUE;
|
|
@@ -740,7 +740,7 @@ lp_build_pack2_native(struct gallivm_state *gallivm,
|
|
|
|
/* At this point only have special case for avx2 */
|
|
if (src_type.length * src_type.width == 256 &&
|
|
- util_cpu_caps.has_avx2) {
|
|
+ util_get_cpu_caps()->has_avx2) {
|
|
switch(src_type.width) {
|
|
case 32:
|
|
if (dst_type.sign) {
|
|
@@ -793,7 +793,7 @@ lp_build_packs2(struct gallivm_state *gallivm,
|
|
|
|
/* All X86 SSE non-interleaved pack instructions take signed inputs and
|
|
* saturate them, so no need to clamp for those cases. */
|
|
- if(util_cpu_caps.has_sse2 &&
|
|
+ if(util_get_cpu_caps()->has_sse2 &&
|
|
src_type.width * src_type.length >= 128 &&
|
|
src_type.sign &&
|
|
(src_type.width == 32 || src_type.width == 16))
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
|
|
index 686abc08620..98dcde912b5 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
|
|
@@ -1152,7 +1152,7 @@ lp_build_minify(struct lp_build_context *bld,
|
|
LLVMValueRef size;
|
|
assert(bld->type.sign);
|
|
if (lod_scalar ||
|
|
- (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
|
|
+ (util_get_cpu_caps()->has_avx2 || !util_get_cpu_caps()->has_sse)) {
|
|
size = LLVMBuildLShr(builder, base_size, level, "minify");
|
|
size = lp_build_max(bld, size, bld->one);
|
|
}
|
|
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
|
|
index 2b91edd37c7..6e47640e70d 100644
|
|
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
|
|
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
|
|
@@ -3234,7 +3234,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
|
|
* as it appears to be a loss with just AVX)
|
|
*/
|
|
if (num_quads == 1 || !use_aos ||
|
|
- (util_cpu_caps.has_avx2 &&
|
|
+ (util_get_cpu_caps()->has_avx2 &&
|
|
(bld.num_lods == 1 ||
|
|
derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
|
|
if (use_aos) {
|
|
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
|
|
index b1c8b990ef1..03b11f914b4 100644
|
|
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
|
|
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
|
|
@@ -35,10 +35,10 @@
|
|
|
|
DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", false);
|
|
|
|
-static struct util_cpu_caps *get_cpu_caps(void)
|
|
+static const struct util_cpu_caps_t *get_cpu_caps(void)
|
|
{
|
|
util_cpu_detect();
|
|
- return &util_cpu_caps;
|
|
+ return util_get_cpu_caps();
|
|
}
|
|
|
|
int rtasm_cpu_has_sse(void)
|
|
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
|
|
index ad687f32853..ddd65fb6a08 100644
|
|
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
|
|
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
|
|
@@ -2152,17 +2152,17 @@ static void x86_init_func_common( struct x86_function *p )
|
|
{
|
|
util_cpu_detect();
|
|
p->caps = 0;
|
|
- if(util_cpu_caps.has_mmx)
|
|
+ if(util_get_cpu_caps()->has_mmx)
|
|
p->caps |= X86_MMX;
|
|
- if(util_cpu_caps.has_mmx2)
|
|
+ if(util_get_cpu_caps()->has_mmx2)
|
|
p->caps |= X86_MMX2;
|
|
- if(util_cpu_caps.has_sse)
|
|
+ if(util_get_cpu_caps()->has_sse)
|
|
p->caps |= X86_SSE;
|
|
- if(util_cpu_caps.has_sse2)
|
|
+ if(util_get_cpu_caps()->has_sse2)
|
|
p->caps |= X86_SSE2;
|
|
- if(util_cpu_caps.has_sse3)
|
|
+ if(util_get_cpu_caps()->has_sse3)
|
|
p->caps |= X86_SSE3;
|
|
- if(util_cpu_caps.has_sse4_1)
|
|
+ if(util_get_cpu_caps()->has_sse4_1)
|
|
p->caps |= X86_SSE4_1;
|
|
p->csr = p->store;
|
|
#if defined(PIPE_ARCH_X86)
|
|
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
|
|
index 1eaff77724e..bf56993db09 100644
|
|
--- a/src/gallium/auxiliary/util/u_threaded_context.c
|
|
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
|
|
@@ -2071,8 +2071,8 @@ tc_set_context_param(struct pipe_context *_pipe,
|
|
if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) {
|
|
/* Pin the gallium thread as requested. */
|
|
util_set_thread_affinity(tc->queue.threads[0],
|
|
- util_cpu_caps.L3_affinity_mask[value],
|
|
- NULL, UTIL_MAX_CPUS);
|
|
+ util_get_cpu_caps()->L3_affinity_mask[value],
|
|
+ NULL, util_get_cpu_caps()->num_cpu_mask_bits);
|
|
|
|
/* Execute this immediately (without enqueuing).
|
|
* It's required to be thread-safe.
|
|
@@ -2720,7 +2720,7 @@ threaded_context_create(struct pipe_context *pipe,
|
|
|
|
util_cpu_detect();
|
|
|
|
- if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
|
|
+ if (!debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1))
|
|
return pipe;
|
|
|
|
tc = os_malloc_aligned(sizeof(struct threaded_context), 16);
|
|
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
|
|
index 64cf72ae101..913c1bd2462 100644
|
|
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
|
|
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
|
|
@@ -435,7 +435,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
|
|
assert(type.length <= 16);
|
|
assert(type.floating);
|
|
|
|
- if(util_cpu_caps.has_sse && type.length == 4) {
|
|
+ if(util_get_cpu_caps()->has_sse && type.length == 4) {
|
|
const char *movmskintr = "llvm.x86.sse.movmsk.ps";
|
|
const char *popcntintr = "llvm.ctpop.i32";
|
|
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
|
|
@@ -446,7 +446,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
|
|
LLVMInt32TypeInContext(context), bits);
|
|
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
|
|
}
|
|
- else if(util_cpu_caps.has_avx && type.length == 8) {
|
|
+ else if(util_get_cpu_caps()->has_avx && type.length == 8) {
|
|
const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
|
|
const char *popcntintr = "llvm.ctpop.i32";
|
|
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
|
|
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
|
|
index f133bbf8a4d..628a4338c1e 100644
|
|
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
|
|
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
|
|
@@ -915,7 +915,7 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
|
|
|
|
screen->allow_cl = !!getenv("LP_CL");
|
|
screen->use_tgsi = (LP_DEBUG & DEBUG_TGSI_IR);
|
|
- screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0;
|
|
+ screen->num_threads = util_get_cpu_caps()->nr_cpus > 1 ? util_get_cpu_caps()->nr_cpus : 0;
|
|
#ifdef EMBEDDED_DEVICE
|
|
screen->num_threads = 0;
|
|
#endif
|
|
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
|
|
index 873dcf37fac..725854cc25c 100644
|
|
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
|
|
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
|
|
@@ -382,7 +382,7 @@ flush_denorm_to_zero(float val)
|
|
fi_val.f = val;
|
|
|
|
#if defined(PIPE_ARCH_SSE)
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
if ((fi_val.ui & 0x7f800000) == 0) {
|
|
fi_val.ui &= 0xff800000;
|
|
}
|
|
@@ -458,7 +458,7 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test, unsigned
|
|
continue;
|
|
}
|
|
|
|
- if (!util_cpu_caps.has_neon &&
|
|
+ if (!util_get_cpu_caps()->has_neon &&
|
|
test->ref == &nearbyintf && length == 2 &&
|
|
ref != roundf(testval)) {
|
|
/* FIXME: The generic (non SSE) path in lp_build_iround, which is
|
|
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
|
|
index 2bf223d66f9..815736166d5 100644
|
|
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
|
|
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
|
|
@@ -85,7 +85,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
|
|
* of a block for all formats) though this should not be strictly necessary
|
|
* neither. In any case it can only affect compressed or 1d textures.
|
|
*/
|
|
- unsigned mip_align = MAX2(64, util_cpu_caps.cacheline);
|
|
+ unsigned mip_align = MAX2(64, util_get_cpu_caps()->cacheline);
|
|
|
|
assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
|
|
assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
|
|
@@ -123,7 +123,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
|
|
if (util_format_is_compressed(pt->format))
|
|
lpr->row_stride[level] = nblocksx * block_size;
|
|
else
|
|
- lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline);
|
|
+ lpr->row_stride[level] = align(nblocksx * block_size, util_get_cpu_caps()->cacheline);
|
|
|
|
/* if row_stride * height > LP_MAX_TEXTURE_SIZE */
|
|
if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) {
|
|
diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp
|
|
index 97db7ca3e8b..d891b6b14e8 100644
|
|
--- a/src/gallium/drivers/swr/swr_loader.cpp
|
|
+++ b/src/gallium/drivers/swr/swr_loader.cpp
|
|
@@ -91,7 +91,7 @@ swr_create_screen(struct sw_winsys *winsys)
|
|
|
|
util_cpu_detect();
|
|
|
|
- if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) {
|
|
+ if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) {
|
|
swr_print_info("SWR detected KNL instruction support ");
|
|
#ifndef HAVE_SWR_KNL
|
|
swr_print_info("(skipping: not built).\n");
|
|
@@ -103,7 +103,7 @@ swr_create_screen(struct sw_winsys *winsys)
|
|
#endif
|
|
}
|
|
|
|
- if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) {
|
|
+ if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) {
|
|
swr_print_info("SWR detected SKX instruction support ");
|
|
#ifndef HAVE_SWR_SKX
|
|
swr_print_info("(skipping not built).\n");
|
|
@@ -113,7 +113,7 @@ swr_create_screen(struct sw_winsys *winsys)
|
|
#endif
|
|
}
|
|
|
|
- if (util_cpu_caps.has_avx2) {
|
|
+ if (util_get_cpu_caps()->has_avx2) {
|
|
swr_print_info("SWR detected AVX2 instruction support ");
|
|
#ifndef HAVE_SWR_AVX2
|
|
swr_print_info("(skipping not built).\n");
|
|
@@ -123,7 +123,7 @@ swr_create_screen(struct sw_winsys *winsys)
|
|
#endif
|
|
}
|
|
|
|
- if (util_cpu_caps.has_avx) {
|
|
+ if (util_get_cpu_caps()->has_avx) {
|
|
swr_print_info("SWR detected AVX instruction support ");
|
|
#ifndef HAVE_SWR_AVX
|
|
swr_print_info("(skipping not built).\n");
|
|
diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
|
|
index 66767e7f1f8..5afe32939a8 100644
|
|
--- a/src/gallium/drivers/vc4/vc4_tiling.h
|
|
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
|
|
@@ -90,7 +90,7 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride,
|
|
int cpp, const struct pipe_box *box)
|
|
{
|
|
#ifdef USE_ARM_ASM
|
|
- if (util_cpu_caps.has_neon) {
|
|
+ if (util_get_cpu_caps()->has_neon) {
|
|
vc4_load_lt_image_neon(dst, dst_stride, src, src_stride,
|
|
cpp, box);
|
|
return;
|
|
@@ -106,7 +106,7 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride,
|
|
int cpp, const struct pipe_box *box)
|
|
{
|
|
#ifdef USE_ARM_ASM
|
|
- if (util_cpu_caps.has_neon) {
|
|
+ if (util_get_cpu_caps()->has_neon) {
|
|
vc4_store_lt_image_neon(dst, dst_stride, src, src_stride,
|
|
cpp, box);
|
|
return;
|
|
diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c
|
|
index 4d9c4e27ebf..782f16e7f78 100644
|
|
--- a/src/gallium/tests/unit/translate_test.c
|
|
+++ b/src/gallium/tests/unit/translate_test.c
|
|
@@ -50,6 +50,7 @@ int main(int argc, char** argv)
|
|
{
|
|
struct translate *(*create_fn)(const struct translate_key *key) = 0;
|
|
|
|
+ extern struct util_cpu_caps_t util_cpu_caps;
|
|
struct translate_key key;
|
|
unsigned output_format;
|
|
unsigned input_format;
|
|
@@ -87,7 +88,7 @@ int main(int argc, char** argv)
|
|
}
|
|
else if (!strcmp(argv[1], "sse"))
|
|
{
|
|
- if(!util_cpu_caps.has_sse || !rtasm_cpu_has_sse())
|
|
+ if(!util_get_cpu_caps()->has_sse || !rtasm_cpu_has_sse())
|
|
{
|
|
printf("Error: CPU doesn't support SSE (test with qemu)\n");
|
|
return 2;
|
|
@@ -99,7 +100,7 @@ int main(int argc, char** argv)
|
|
}
|
|
else if (!strcmp(argv[1], "sse2"))
|
|
{
|
|
- if(!util_cpu_caps.has_sse2 || !rtasm_cpu_has_sse())
|
|
+ if(!util_get_cpu_caps()->has_sse2 || !rtasm_cpu_has_sse())
|
|
{
|
|
printf("Error: CPU doesn't support SSE2 (test with qemu)\n");
|
|
return 2;
|
|
@@ -110,7 +111,7 @@ int main(int argc, char** argv)
|
|
}
|
|
else if (!strcmp(argv[1], "sse3"))
|
|
{
|
|
- if(!util_cpu_caps.has_sse3 || !rtasm_cpu_has_sse())
|
|
+ if(!util_get_cpu_caps()->has_sse3 || !rtasm_cpu_has_sse())
|
|
{
|
|
printf("Error: CPU doesn't support SSE3 (test with qemu)\n");
|
|
return 2;
|
|
@@ -120,7 +121,7 @@ int main(int argc, char** argv)
|
|
}
|
|
else if (!strcmp(argv[1], "sse4.1"))
|
|
{
|
|
- if(!util_cpu_caps.has_sse4_1 || !rtasm_cpu_has_sse())
|
|
+ if(!util_get_cpu_caps()->has_sse4_1 || !rtasm_cpu_has_sse())
|
|
{
|
|
printf("Error: CPU doesn't support SSE4.1 (test with qemu)\n");
|
|
return 2;
|
|
diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c
|
|
index 7f2eba9382b..4474cfb82b0 100644
|
|
--- a/src/gallium/tests/unit/u_half_test.c
|
|
+++ b/src/gallium/tests/unit/u_half_test.c
|
|
@@ -36,13 +36,14 @@ test(void)
|
|
int
|
|
main(int argc, char **argv)
|
|
{
|
|
- assert(!util_cpu_caps.has_f16c);
|
|
+ util_cpu_detect();
|
|
test();
|
|
|
|
- /* Test f16c. */
|
|
- util_cpu_detect();
|
|
- if (util_cpu_caps.has_f16c)
|
|
+ /* Test non-f16c. */
|
|
+ if (util_get_cpu_caps()->has_f16c) {
|
|
+ ((struct util_cpu_caps_t *)util_get_cpu_caps())->has_f16c = false;
|
|
test();
|
|
+ }
|
|
|
|
printf("Success!\n");
|
|
return 0;
|
|
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
|
|
index 8a0aedfed64..a18362ce6ea 100644
|
|
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
|
|
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
|
|
@@ -312,8 +312,8 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws,
|
|
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
|
|
|
|
util_set_thread_affinity(ws->cs_queue.threads[0],
|
|
- util_cpu_caps.L3_affinity_mask[cache],
|
|
- NULL, UTIL_MAX_CPUS);
|
|
+ util_get_cpu_caps()->L3_affinity_mask[cache],
|
|
+ NULL, util_get_cpu_caps()->num_cpu_mask_bits);
|
|
}
|
|
|
|
static uint32_t kms_handle_hash(const void *key)
|
|
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
|
|
index f0e1b9f7df3..4430ce50466 100644
|
|
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
|
|
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
|
|
@@ -801,8 +801,8 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws,
|
|
|
|
if (util_queue_is_initialized(&rws->cs_queue)) {
|
|
util_set_thread_affinity(rws->cs_queue.threads[0],
|
|
- util_cpu_caps.L3_affinity_mask[cache],
|
|
- NULL, UTIL_MAX_CPUS);
|
|
+ util_get_cpu_caps()->L3_affinity_mask[cache],
|
|
+ NULL, util_get_cpu_caps()->num_cpu_mask_bits);
|
|
}
|
|
}
|
|
|
|
diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c
|
|
index eb8eb30cabc..c9dfef541fc 100644
|
|
--- a/src/mesa/main/glthread.c
|
|
+++ b/src/mesa/main/glthread.c
|
|
@@ -199,19 +199,20 @@ _mesa_glthread_flush_batch(struct gl_context *ctx)
|
|
/* Pin threads regularly to the same Zen CCX that the main thread is
|
|
* running on. The main thread can move between CCXs.
|
|
*/
|
|
- if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
|
|
+ if (util_get_cpu_caps()->nr_cpus != util_get_cpu_caps()->cores_per_L3 &&
|
|
/* driver support */
|
|
ctx->Driver.PinDriverToL3Cache &&
|
|
++glthread->pin_thread_counter % 128 == 0) {
|
|
int cpu = util_get_current_cpu();
|
|
|
|
if (cpu >= 0) {
|
|
- unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
|
|
-
|
|
- util_set_thread_affinity(glthread->queue.threads[0],
|
|
- util_cpu_caps.L3_affinity_mask[L3_cache],
|
|
- NULL, UTIL_MAX_CPUS);
|
|
- ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
|
|
+ uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu];
|
|
+ if (L3_cache != U_CPU_INVALID_L3) {
|
|
+ util_set_thread_affinity(glthread->queue.threads[0],
|
|
+ util_get_cpu_caps()->L3_affinity_mask[L3_cache],
|
|
+ NULL, util_get_cpu_caps()->num_cpu_mask_bits);
|
|
+ ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
|
|
+ }
|
|
}
|
|
}
|
|
|
|
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
|
|
index 40364296664..f27fa7ff29c 100644
|
|
--- a/src/mesa/state_tracker/st_context.c
|
|
+++ b/src/mesa/state_tracker/st_context.c
|
|
@@ -815,6 +815,10 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
|
|
!st->lower_ucp;
|
|
st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders;
|
|
|
|
+ if (util_get_cpu_caps()->cores_per_L3 == util_get_cpu_caps()->nr_cpus ||
|
|
+ !st->pipe->set_context_param)
|
|
+ st->pin_thread_counter = ST_L3_PINNING_DISABLED;
|
|
+
|
|
st->bitmap.cache.empty = true;
|
|
|
|
if (ctx->Const.ForceGLNamesReuse && ctx->Shared->RefCount == 1) {
|
|
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
|
|
index b1fda06ff3e..9ab6969de62 100644
|
|
--- a/src/mesa/state_tracker/st_context.h
|
|
+++ b/src/mesa/state_tracker/st_context.h
|
|
@@ -55,6 +55,7 @@ struct st_program;
|
|
struct st_perf_monitor_group;
|
|
struct u_upload_mgr;
|
|
|
|
+#define ST_L3_PINNING_DISABLED 0xffffffff
|
|
|
|
struct st_bitmap_cache
|
|
{
|
|
@@ -130,6 +131,9 @@ struct st_context
|
|
struct draw_stage *feedback_stage; /**< For GL_FEEDBACK rendermode */
|
|
struct draw_stage *selection_stage; /**< For GL_SELECT rendermode */
|
|
struct draw_stage *rastpos_stage; /**< For glRasterPos */
|
|
+
|
|
+ unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */
|
|
+
|
|
GLboolean clamp_frag_color_in_shader;
|
|
GLboolean clamp_vert_color_in_shader;
|
|
boolean clamp_frag_depth_in_shader;
|
|
@@ -235,8 +239,6 @@ struct st_context
|
|
/** This masks out unused shader resources. Only valid in draw calls. */
|
|
uint64_t active_states;
|
|
|
|
- unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */
|
|
-
|
|
/* If true, further analysis of states is required to know if something
|
|
* has changed. Used mainly for shaders.
|
|
*/
|
|
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
|
|
index 996d985510c..159d7017b07 100644
|
|
--- a/src/mesa/state_tracker/st_draw.c
|
|
+++ b/src/mesa/state_tracker/st_draw.c
|
|
@@ -124,26 +124,26 @@ prepare_draw(struct st_context *st, struct gl_context *ctx)
|
|
st_validate_state(st, ST_PIPELINE_RENDER);
|
|
}
|
|
|
|
- struct pipe_context *pipe = st->pipe;
|
|
-
|
|
/* Pin threads regularly to the same Zen CCX that the main thread is
|
|
* running on. The main thread can move between CCXs.
|
|
*/
|
|
- if (unlikely(/* AMD Zen */
|
|
- util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
|
|
+ if (unlikely(st->pin_thread_counter != ST_L3_PINNING_DISABLED &&
|
|
/* no glthread */
|
|
ctx->CurrentClientDispatch != ctx->MarshalExec &&
|
|
- /* driver support */
|
|
- pipe->set_context_param &&
|
|
/* do it occasionally */
|
|
++st->pin_thread_counter % 512 == 0)) {
|
|
+ st->pin_thread_counter = 0;
|
|
+
|
|
int cpu = util_get_current_cpu();
|
|
if (cpu >= 0) {
|
|
- unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
|
|
-
|
|
- pipe->set_context_param(pipe,
|
|
- PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
|
|
- L3_cache);
|
|
+ struct pipe_context *pipe = st->pipe;
|
|
+ uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu];
|
|
+
|
|
+ if (L3_cache != U_CPU_INVALID_L3) {
|
|
+ pipe->set_context_param(pipe,
|
|
+ PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
|
|
+ L3_cache);
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
diff --git a/src/util/half_float.h b/src/util/half_float.h
|
|
index c52bccf8d1e..8f1a1dbf11d 100644
|
|
--- a/src/util/half_float.h
|
|
+++ b/src/util/half_float.h
|
|
@@ -59,7 +59,7 @@ static inline uint16_t
|
|
_mesa_float_to_half(float val)
|
|
{
|
|
#if defined(USE_X86_64_ASM)
|
|
- if (util_cpu_caps.has_f16c) {
|
|
+ if (util_get_cpu_caps()->has_f16c) {
|
|
__m128 in = {val};
|
|
__m128i out;
|
|
|
|
@@ -75,7 +75,7 @@ static inline float
|
|
_mesa_half_to_float(uint16_t val)
|
|
{
|
|
#if defined(USE_X86_64_ASM)
|
|
- if (util_cpu_caps.has_f16c) {
|
|
+ if (util_get_cpu_caps()->has_f16c) {
|
|
__m128i in = {val};
|
|
__m128 out;
|
|
|
|
@@ -90,7 +90,7 @@ static inline uint16_t
|
|
_mesa_float_to_float16_rtz(float val)
|
|
{
|
|
#if defined(USE_X86_64_ASM)
|
|
- if (util_cpu_caps.has_f16c) {
|
|
+ if (util_get_cpu_caps()->has_f16c) {
|
|
__m128 in = {val};
|
|
__m128i out;
|
|
|
|
diff --git a/src/util/tests/format/u_format_test.c b/src/util/tests/format/u_format_test.c
|
|
index f4a62a5c6a8..e6473c2bf6d 100644
|
|
--- a/src/util/tests/format/u_format_test.c
|
|
+++ b/src/util/tests/format/u_format_test.c
|
|
@@ -850,6 +850,8 @@ int main(int argc, char **argv)
|
|
{
|
|
boolean success;
|
|
|
|
+ util_cpu_detect();
|
|
+
|
|
success = test_all();
|
|
|
|
return success ? 0 : 1;
|
|
diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
|
|
index 025f2f30156..4a4b06e1bc6 100644
|
|
--- a/src/util/u_cpu_detect.c
|
|
+++ b/src/util/u_cpu_detect.c
|
|
@@ -90,7 +90,7 @@
|
|
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
|
|
|
|
|
|
-struct util_cpu_caps util_cpu_caps;
|
|
+struct util_cpu_caps_t util_cpu_caps;
|
|
|
|
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
|
|
static int has_cpuid(void);
|
|
@@ -438,26 +438,22 @@ get_cpu_topology(void)
|
|
util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
|
|
util_cpu_caps.num_L3_caches = 1;
|
|
|
|
+ memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
|
|
+
|
|
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
|
|
/* AMD Zen */
|
|
if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
|
|
util_cpu_caps.family < CPU_AMD_LAST) {
|
|
uint32_t regs[4];
|
|
|
|
- /* Query the L3 cache count. */
|
|
- cpuid_count(0x8000001D, 3, regs);
|
|
- unsigned cache_level = (regs[0] >> 5) & 0x7;
|
|
- unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
|
|
-
|
|
- if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
|
|
- return;
|
|
-
|
|
uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
|
|
uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
|
|
- uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
|
|
- uint32_t apic_id[UTIL_MAX_CPUS];
|
|
bool saved = false;
|
|
|
|
+ uint32_t L3_found[UTIL_MAX_CPUS] = {0};
|
|
+ uint32_t num_L3_caches = 0;
|
|
+ util_affinity_mask *L3_affinity_masks = NULL;
|
|
+
|
|
/* Query APIC IDs from each CPU core.
|
|
*
|
|
* An APIC ID is a logical ID of the CPU with respect to the cache
|
|
@@ -482,41 +478,60 @@ get_cpu_topology(void)
|
|
|
|
if (util_set_current_thread_affinity(mask,
|
|
!saved ? saved_mask : NULL,
|
|
- UTIL_MAX_CPUS)) {
|
|
+ util_cpu_caps.num_cpu_mask_bits)) {
|
|
saved = true;
|
|
- allowed_mask[i / 32] |= cpu_bit;
|
|
|
|
/* Query the APIC ID of the current core. */
|
|
cpuid(0x00000001, regs);
|
|
- apic_id[i] = regs[1] >> 24;
|
|
+ unsigned apic_id = regs[1] >> 24;
|
|
+
|
|
+ /* Query the total core count for the CPU */
|
|
+ uint32_t core_count = 1;
|
|
+ if (regs[3] & (1 << 28))
|
|
+ core_count = (regs[1] >> 16) & 0xff;
|
|
+
|
|
+ core_count = util_next_power_of_two(core_count);
|
|
+
|
|
+ /* Query the L3 cache count. */
|
|
+ cpuid_count(0x8000001D, 3, regs);
|
|
+ unsigned cache_level = (regs[0] >> 5) & 0x7;
|
|
+ unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
|
|
+
|
|
+ if (cache_level != 3)
|
|
+ continue;
|
|
+
|
|
+ unsigned local_core_id = apic_id & (core_count - 1);
|
|
+ unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
|
|
+ unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
|
|
+#define L3_ID(p, i) (p << 16 | i << 1 | 1);
|
|
+
|
|
+ unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
|
|
+ int idx = -1;
|
|
+ for (unsigned c = 0; c < num_L3_caches; c++) {
|
|
+ if (L3_found[c] == l3_id) {
|
|
+ idx = c;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ if (idx == -1) {
|
|
+ idx = num_L3_caches;
|
|
+ L3_found[num_L3_caches++] = l3_id;
|
|
+ L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
|
|
+ if (!L3_affinity_masks)
|
|
+ return;
|
|
+ memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
|
|
+ }
|
|
+ util_cpu_caps.cpu_to_L3[i] = idx;
|
|
+ L3_affinity_masks[idx][i / 32] |= cpu_bit;
|
|
+
|
|
}
|
|
mask[i / 32] = 0;
|
|
}
|
|
|
|
- if (saved) {
|
|
-
|
|
- /* We succeeded in using at least one CPU. */
|
|
- util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
|
|
- util_cpu_caps.cores_per_L3 = cores_per_L3;
|
|
- util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
|
|
- util_cpu_caps.num_L3_caches);
|
|
-
|
|
- for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
|
|
- i++) {
|
|
- uint32_t cpu_bit = 1u << (i % 32);
|
|
-
|
|
- if (allowed_mask[i / 32] & cpu_bit) {
|
|
- /* Each APIC ID bit represents a topology level, so we need
|
|
- * to round up to the next power of two.
|
|
- */
|
|
- unsigned L3_index = apic_id[i] /
|
|
- util_next_power_of_two(cores_per_L3);
|
|
-
|
|
- util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
|
|
- util_cpu_caps.cpu_to_L3[i] = L3_index;
|
|
- }
|
|
- }
|
|
+ util_cpu_caps.num_L3_caches = num_L3_caches;
|
|
+ util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
|
|
|
|
+ if (saved) {
|
|
if (debug_get_option_dump_cpu()) {
|
|
fprintf(stderr, "CPU <-> L3 cache mapping:\n");
|
|
for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
|
|
@@ -528,7 +543,8 @@ get_cpu_topology(void)
|
|
}
|
|
|
|
/* Restore the original affinity mask. */
|
|
- util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
|
|
+ util_set_current_thread_affinity(saved_mask, NULL,
|
|
+ util_cpu_caps.num_cpu_mask_bits);
|
|
} else {
|
|
if (debug_get_option_dump_cpu())
|
|
fprintf(stderr, "Cannot set thread affinity for any thread.\n");
|
|
@@ -547,7 +563,7 @@ util_cpu_detect_once(void)
|
|
{
|
|
SYSTEM_INFO system_info;
|
|
GetSystemInfo(&system_info);
|
|
- util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
|
|
+ util_cpu_caps.nr_cpus = MAX2(1, system_info.dwNumberOfProcessors);
|
|
}
|
|
#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
|
|
util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
|
|
@@ -569,6 +585,8 @@ util_cpu_detect_once(void)
|
|
util_cpu_caps.nr_cpus = 1;
|
|
#endif
|
|
|
|
+ util_cpu_caps.num_cpu_mask_bits = align(util_cpu_caps.nr_cpus, 32);
|
|
+
|
|
/* Make the fallback cacheline size nonzero so that it can be
|
|
* safely passed to align().
|
|
*/
|
|
diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h
|
|
index a76fd912910..1c7239b2ec7 100644
|
|
--- a/src/util/u_cpu_detect.h
|
|
+++ b/src/util/u_cpu_detect.h
|
|
@@ -55,7 +55,7 @@ enum cpu_family {
|
|
|
|
typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
|
|
|
|
-struct util_cpu_caps {
|
|
+struct util_cpu_caps_t {
|
|
int nr_cpus;
|
|
enum cpu_family family;
|
|
|
|
@@ -98,14 +98,27 @@ struct util_cpu_caps {
|
|
|
|
unsigned num_L3_caches;
|
|
unsigned cores_per_L3;
|
|
+ unsigned num_cpu_mask_bits;
|
|
|
|
uint16_t cpu_to_L3[UTIL_MAX_CPUS];
|
|
/* Affinity masks for each L3 cache. */
|
|
util_affinity_mask *L3_affinity_mask;
|
|
};
|
|
|
|
-extern struct util_cpu_caps
|
|
-util_cpu_caps;
|
|
+#define U_CPU_INVALID_L3 0xffff
|
|
+
|
|
+static inline const struct util_cpu_caps_t *
|
|
+util_get_cpu_caps(void)
|
|
+{
|
|
+ extern struct util_cpu_caps_t util_cpu_caps;
|
|
+
|
|
+ /* If you hit this assert, it means that something is using the
|
|
+ * cpu-caps without having first called util_cpu_detect()
|
|
+ */
|
|
+ assert(util_cpu_caps.nr_cpus >= 1);
|
|
+
|
|
+ return &util_cpu_caps;
|
|
+}
|
|
|
|
void util_cpu_detect(void);
|
|
|
|
diff --git a/src/util/u_math.c b/src/util/u_math.c
|
|
index 9a8a9ecbbde..41e7f599eb0 100644
|
|
--- a/src/util/u_math.c
|
|
+++ b/src/util/u_math.c
|
|
@@ -92,7 +92,7 @@ util_fpstate_get(void)
|
|
unsigned mxcsr = 0;
|
|
|
|
#if defined(PIPE_ARCH_SSE)
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
mxcsr = _mm_getcsr();
|
|
}
|
|
#endif
|
|
@@ -110,10 +110,10 @@ unsigned
|
|
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
|
|
{
|
|
#if defined(PIPE_ARCH_SSE)
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
/* Enable flush to zero mode */
|
|
current_mxcsr |= _MM_FLUSH_ZERO_MASK;
|
|
- if (util_cpu_caps.has_daz) {
|
|
+ if (util_get_cpu_caps()->has_daz) {
|
|
/* Enable denormals are zero mode */
|
|
current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
|
|
}
|
|
@@ -132,7 +132,7 @@ void
|
|
util_fpstate_set(unsigned mxcsr)
|
|
{
|
|
#if defined(PIPE_ARCH_SSE)
|
|
- if (util_cpu_caps.has_sse) {
|
|
+ if (util_get_cpu_caps()->has_sse) {
|
|
_mm_setcsr(mxcsr);
|
|
}
|
|
#endif
|
|
diff --git a/src/util/u_queue.c b/src/util/u_queue.c
|
|
index b11b297a45c..8f21f0667c6 100644
|
|
--- a/src/util/u_queue.c
|
|
+++ b/src/util/u_queue.c
|
|
@@ -27,7 +27,7 @@
|
|
#include "u_queue.h"
|
|
|
|
#include "c11/threads.h"
|
|
-
|
|
+#include "util/u_cpu_detect.h"
|
|
#include "util/os_time.h"
|
|
#include "util/u_string.h"
|
|
#include "util/u_thread.h"
|
|
@@ -258,7 +258,8 @@ util_queue_thread_func(void *input)
|
|
uint32_t mask[UTIL_MAX_CPUS / 32];
|
|
|
|
memset(mask, 0xff, sizeof(mask));
|
|
- util_set_current_thread_affinity(mask, NULL, UTIL_MAX_CPUS);
|
|
+ util_set_current_thread_affinity(mask, NULL,
|
|
+ util_get_cpu_caps()->num_cpu_mask_bits);
|
|
}
|
|
|
|
#if defined(__linux__)
|