diff --git a/0001-Initial-public-Mesa-SWR.patch b/0001-Initial-public-Mesa-SWR.patch deleted file mode 100644 index 528678e..0000000 --- a/0001-Initial-public-Mesa-SWR.patch +++ /dev/null @@ -1,6428 +0,0 @@ -From 293435cf5955935a6ce43bf59a6d743aad8be6d8 Mon Sep 17 00:00:00 2001 -From: Tim Rowley -Date: Mon, 19 Oct 2015 13:31:29 -0500 -Subject: [PATCH 1/3] Initial public Mesa+SWR - ---- - README.md | 33 + - configure.ac | 54 + - src/gallium/Makefile.am | 4 + - src/gallium/SConscript | 1 + - src/gallium/auxiliary/gallivm/lp_bld_flow.h | 7 + - src/gallium/auxiliary/gallivm/lp_bld_init.h | 7 + - src/gallium/auxiliary/gallivm/lp_bld_sample.h | 6 + - src/gallium/auxiliary/gallivm/lp_bld_tgsi.h | 8 + - .../auxiliary/target-helpers/inline_sw_helper.h | 13 +- - .../target-helpers/inline_wrapper_sw_helper.h | 2 +- - src/gallium/drivers/swr/.clang-format | 64 + - src/gallium/drivers/swr/Automake.inc | 28 + - src/gallium/drivers/swr/Makefile.am | 82 ++ - src/gallium/drivers/swr/Makefile.sources | 114 ++ - src/gallium/drivers/swr/SConscript | 69 + - src/gallium/drivers/swr/swr_clear.cpp | 141 ++ - src/gallium/drivers/swr/swr_context.cpp | 392 ++++++ - src/gallium/drivers/swr/swr_context.h | 172 +++ - src/gallium/drivers/swr/swr_context_llvm.h | 124 ++ - src/gallium/drivers/swr/swr_draw.cpp | 277 ++++ - src/gallium/drivers/swr/swr_fence.cpp | 141 ++ - src/gallium/drivers/swr/swr_fence.h | 73 ++ - src/gallium/drivers/swr/swr_memory.h | 99 ++ - src/gallium/drivers/swr/swr_public.h | 40 + - src/gallium/drivers/swr/swr_query.cpp | 334 +++++ - src/gallium/drivers/swr/swr_query.h | 48 + - src/gallium/drivers/swr/swr_resource.h | 98 ++ - src/gallium/drivers/swr/swr_scratch.cpp | 116 ++ - src/gallium/drivers/swr/swr_scratch.h | 63 + - src/gallium/drivers/swr/swr_screen.cpp | 666 ++++++++++ - src/gallium/drivers/swr/swr_screen.h | 52 + - src/gallium/drivers/swr/swr_shader.cpp | 608 +++++++++ - src/gallium/drivers/swr/swr_shader.h | 61 + - src/gallium/drivers/swr/swr_state.cpp | 1344 ++++++++++++++++++++ - src/gallium/drivers/swr/swr_state.h | 240 ++++ - src/gallium/drivers/swr/swr_tex_sample.cpp | 338 +++++ - src/gallium/drivers/swr/swr_tex_sample.h | 47 + - src/gallium/targets/libgl-xlib/Makefile.am | 5 + - src/gallium/targets/libgl-xlib/SConscript | 4 + - src/gallium/targets/osmesa/Makefile.am | 6 + - 40 files changed, 5979 insertions(+), 2 deletions(-) - create mode 100644 README.md - create mode 100644 src/gallium/drivers/swr/.clang-format - create mode 100644 src/gallium/drivers/swr/Automake.inc - create mode 100644 src/gallium/drivers/swr/Makefile.am - create mode 100644 src/gallium/drivers/swr/Makefile.sources - create mode 100644 src/gallium/drivers/swr/SConscript - create mode 100644 src/gallium/drivers/swr/swr_clear.cpp - create mode 100644 src/gallium/drivers/swr/swr_context.cpp - create mode 100644 src/gallium/drivers/swr/swr_context.h - create mode 100644 src/gallium/drivers/swr/swr_context_llvm.h - create mode 100644 src/gallium/drivers/swr/swr_draw.cpp - create mode 100644 src/gallium/drivers/swr/swr_fence.cpp - create mode 100644 src/gallium/drivers/swr/swr_fence.h - create mode 100644 src/gallium/drivers/swr/swr_memory.h - create mode 100644 src/gallium/drivers/swr/swr_public.h - create mode 100644 src/gallium/drivers/swr/swr_query.cpp - create mode 100644 src/gallium/drivers/swr/swr_query.h - create mode 100644 src/gallium/drivers/swr/swr_resource.h - create mode 100644 src/gallium/drivers/swr/swr_scratch.cpp - create mode 100644 src/gallium/drivers/swr/swr_scratch.h - create mode 100644 
src/gallium/drivers/swr/swr_screen.cpp - create mode 100644 src/gallium/drivers/swr/swr_screen.h - create mode 100644 src/gallium/drivers/swr/swr_shader.cpp - create mode 100644 src/gallium/drivers/swr/swr_shader.h - create mode 100644 src/gallium/drivers/swr/swr_state.cpp - create mode 100644 src/gallium/drivers/swr/swr_state.h - create mode 100644 src/gallium/drivers/swr/swr_tex_sample.cpp - create mode 100644 src/gallium/drivers/swr/swr_tex_sample.h - -diff --git a/README.md b/README.md -new file mode 100644 -index 0000000..3bf3031 ---- /dev/null -+++ b/README.md -@@ -0,0 +1,33 @@ -+OpenSWR-Mesa -+============ -+ -+Overview -+-------- -+ -+This is repository of the integration work combining the high -+performance, highly scalable core SWR rasterizer with Mesa. A more -+complete introduction and discussion towards upstreaming to the Mesa -+project can be found on the mesa-dev mailing list. -+ -+Notes -+----- -+ -+* SWR is set as the default software renderer. Use -+GALLIUM_DRIVER=llvmpipe to switch to Mesa's standard rasterizer. This -+particular change is to make it easier for people evaluating OpenSWR, -+and will not be upstreamed. -+ -+* LLVM-3.6 is required. -+ -+* To build SWR with autoconf, include the following in the config -+line: "--with-gallium-drivers=swr --enable-swr-native". -+ -+* Build defaults to AVX2; for a version to run on AVX build with -+ "--with-swr-arch=AVX". -+ -+* To build SWR with SCons, nothing needs to be done - it is built by -+ default. -+ -+* Code for the driver is in src/gallium/drivers/swr -+ -+* Code for the rasterizer is in src/gallium/drivers/swr/rasterizer -diff --git a/configure.ac b/configure.ac -index d3df195..f216dc7 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -1753,6 +1753,11 @@ AC_SUBST([LLVM_LIBS]) - AC_SUBST([LLVM_LDFLAGS]) - AC_SUBST([LLVM_INCLUDEDIR]) - AC_SUBST([LLVM_VERSION]) -+AC_SUBST([SWR_LIBDIR]) -+AC_SUBST([SWR_ARCH]) -+AC_SUBST([SWR_ARCH_FLAG]) -+AC_SUBST([SWR_NATIVE]) -+AC_SUBST([SWR_INCLUDEDIR]) - AC_SUBST([CLANG_RESOURCE_DIR]) - - case "x$enable_opengl$enable_gles1$enable_gles2" in -@@ -2177,6 +2182,9 @@ if test -n "$with_gallium_drivers"; then - HAVE_GALLIUM_LLVMPIPE=yes - fi - ;; -+ xswr) -+ HAVE_GALLIUM_SWR=yes -+ ;; - xvc4) - HAVE_GALLIUM_VC4=yes - gallium_require_drm "vc4" -@@ -2243,6 +2251,41 @@ if test "x$MESA_LLVM" != x0; then - fi - fi - -+dnl SWR include/library -+ -+AC_ARG_WITH([swr-includedir], -+ [AS_HELP_STRING([--with-swr-includedir], [Path to SWR includes])], -+ [SWR_INCLUDEDIR="$withval"], -+ [SWR_INCLUDEDIR='']) -+ -+AC_ARG_WITH([swr-libdir], -+ [AS_HELP_STRING([--with-swr-libdir], [Path to SWR library])], -+ [SWR_LIBDIR="$withval"], -+ [SWR_LIBDIR='']) -+ -+AC_ARG_WITH([swr-arch], -+ [AS_HELP_STRING([--with-swr-arch], [AVX architecture for swr (AVX | CORE_AVX2) ])], -+ [SWR_ARCH="$withval"], -+ [SWR_ARCH="CORE-AVX2"]) -+ -+case "$SWR_ARCH" in -+"AVX") -+ SWR_ARCH_FLAG='-march=core-avx-i -DKNOB_ARCH=KNOB_ARCH_AVX ' -+ ;; -+"CORE-AVX2") -+ SWR_ARCH_FLAG='-march=core-avx2 -DKNOB_ARCH=KNOB_ARCH_AVX2 ' -+ ;; -+**) -+ SWR_ARCH_FLAG='-march=core-avx2 -DKNOB_ARCH=KNOB_ARCH_AVX2 ' -+esac -+ -+AC_ARG_ENABLE([swr-native], -+ [AS_HELP_STRING([--enable-swr-native], -+ [use in-tree version of SWR core @<:@default=disabled@:>@])], -+ [enable_swr_native="$enableval"], -+ [enable_swr_native=no] -+) -+ - AM_CONDITIONAL(HAVE_GALLIUM_SVGA, test "x$HAVE_GALLIUM_SVGA" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_I915, test "x$HAVE_GALLIUM_I915" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_ILO, test "x$HAVE_GALLIUM_ILO" = xyes) -@@ -2255,6 
+2298,8 @@ AM_CONDITIONAL(HAVE_GALLIUM_NOUVEAU, test "x$HAVE_GALLIUM_NOUVEAU" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes) -+AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes) -+AM_CONDITIONAL(SWR_NATIVE, test "x$enable_swr_native" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes) - - AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno) -@@ -2374,6 +2419,7 @@ AC_CONFIG_FILES([Makefile - src/gallium/drivers/rbug/Makefile - src/gallium/drivers/softpipe/Makefile - src/gallium/drivers/svga/Makefile -+ src/gallium/drivers/swr/Makefile - src/gallium/drivers/trace/Makefile - src/gallium/drivers/vc4/Makefile - src/gallium/state_trackers/clover/Makefile -@@ -2562,6 +2608,14 @@ if test "x$MESA_LLVM" = x1; then - echo " LLVM_LDFLAGS: $LLVM_LDFLAGS" - echo "" - fi -+if test "x$HAVE_GALLIUM_SWR" = xyes; then -+ echo " SWR_INCLUDEDIR: $SWR_INCLUDEDIR" -+ echo " SWR_LIBDIR: $SWR_LIBDIR" -+ echo " SWR_ARCH: $SWR_ARCH" -+ echo " SWR_ARCH_FLAG: $SWR_ARCH_FLAG" -+ echo " SWR_NATIVE: $enable_swr_native" -+ echo "" -+fi - echo " PYTHON2: $PYTHON2" - - echo "" -diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am -index a7c3606..dcce6a3 100644 ---- a/src/gallium/Makefile.am -+++ b/src/gallium/Makefile.am -@@ -77,6 +77,10 @@ SUBDIRS += drivers/llvmpipe - endif - endif - -+if HAVE_GALLIUM_SWR -+SUBDIRS += drivers/swr -+endif -+ - ## vc4/rpi - if HAVE_GALLIUM_VC4 - SUBDIRS += drivers/vc4 winsys/vc4/drm -diff --git a/src/gallium/SConscript b/src/gallium/SConscript -index fa5fa6e..766c24a 100644 ---- a/src/gallium/SConscript -+++ b/src/gallium/SConscript -@@ -17,6 +17,7 @@ SConscript([ - 'drivers/softpipe/SConscript', - 'drivers/svga/SConscript', - 'drivers/trace/SConscript', -+ 'drivers/swr/SConscript', - ]) - - # -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h -index 0da849b..083b0ad 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h -@@ -37,6 +37,9 @@ - - #include "gallivm/lp_bld.h" - -+#ifdef __cplusplus -+extern "C" { -+#endif - - struct lp_type; - -@@ -198,4 +201,8 @@ lp_build_array_alloca(struct gallivm_state *gallivm, - LLVMValueRef count, - const char *name); - -+#ifdef __cplusplus -+} -+#endif -+ - #endif /* !LP_BLD_FLOW_H */ -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h -index 9e50f88..ab44661 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_init.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h -@@ -35,6 +35,9 @@ - #include "lp_bld.h" - #include - -+#ifdef __cplusplus -+extern "C" { -+#endif - - struct gallivm_state - { -@@ -82,4 +85,8 @@ void - lp_set_store_alignment(LLVMValueRef Inst, - unsigned Align); - -+#ifdef __cplusplus -+} -+#endif -+ - #endif /* !LP_BLD_INIT_H */ -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h -index eba758d..5f53c47 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h -@@ -42,6 +42,9 @@ - #include "gallivm/lp_bld_type.h" - #include "gallivm/lp_bld_swizzle.h" - -+#ifdef __cplusplus -+extern "C" { -+#endif - - struct pipe_resource; - struct pipe_sampler_view; -@@ -612,5 +615,8 @@ lp_build_minify(struct 
lp_build_context *bld, - LLVMValueRef level, - boolean lod_scalar); - -+#ifdef __cplusplus -+} -+#endif - - #endif /* LP_BLD_SAMPLE_H */ -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h -index 2ca9c61..189d03d 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h -@@ -48,6 +48,10 @@ - #include "tgsi/tgsi_scan.h" - #include "tgsi/tgsi_info.h" - -+#ifdef __cplusplus -+extern "C" { -+#endif -+ - #define LP_CHAN_ALL ~0 - - #define LP_MAX_INSTRUCTIONS 256 -@@ -661,4 +665,8 @@ lp_build_tgsi_llvm( - struct lp_build_tgsi_context * bld_base, - const struct tgsi_token *tokens); - -+#ifdef __cplusplus -+} -+#endif -+ - #endif /* LP_BLD_TGSI_H */ -diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h -index 5f46552..e67dd17 100644 ---- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h -+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h -@@ -19,6 +19,10 @@ - #include "llvmpipe/lp_public.h" - #endif - -+#ifdef GALLIUM_SWR -+#include "swr/swr_public.h" -+#endif -+ - - static inline struct pipe_screen * - sw_screen_create_named(struct sw_winsys *winsys, const char *driver) -@@ -30,6 +34,11 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) - screen = llvmpipe_create_screen(winsys); - #endif - -+#if defined(GALLIUM_SWR) -+ if (screen == NULL && strcmp(driver, "swr") == 0) -+ screen = swr_create_screen(winsys); -+#endif -+ - #if defined(GALLIUM_SOFTPIPE) - if (screen == NULL) - screen = softpipe_create_screen(winsys); -@@ -45,7 +54,9 @@ sw_screen_create(struct sw_winsys *winsys) - const char *default_driver; - const char *driver; - --#if defined(GALLIUM_LLVMPIPE) -+#if defined(GALLIUM_SWR) -+ default_driver = "swr"; -+#elif defined(GALLIUM_LLVMPIPE) - default_driver = "llvmpipe"; - #elif defined(GALLIUM_SOFTPIPE) - default_driver = "softpipe"; -diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h -index 4f38ba9..d707b8b 100644 ---- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h -+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h -@@ -12,7 +12,7 @@ - static inline struct pipe_screen * - sw_screen_wrap(struct pipe_screen *screen) - { --#if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE) -+#if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE) || defined(GALLIUM_SWR) - struct sw_winsys *sws; - struct pipe_screen *sw_screen = NULL; - const char *driver; -diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format -new file mode 100644 -index 0000000..0ec65a5 ---- /dev/null -+++ b/src/gallium/drivers/swr/.clang-format -@@ -0,0 +1,64 @@ -+--- -+Language: Cpp -+AccessModifierOffset: -3 -+AlignAfterOpenBracket: true -+AlignEscapedNewlinesLeft: false -+AlignOperands: false -+AlignTrailingComments: false -+AllowAllParametersOfDeclarationOnNextLine: true -+AllowShortBlocksOnASingleLine: false -+AllowShortCaseLabelsOnASingleLine: false -+AllowShortIfStatementsOnASingleLine: false -+AllowShortLoopsOnASingleLine: false -+AllowShortFunctionsOnASingleLine: All -+AlwaysBreakAfterDefinitionReturnType: true -+AlwaysBreakTemplateDeclarations: false -+AlwaysBreakBeforeMultilineStrings: false -+BreakBeforeBinaryOperators: NonAssignment -+BreakBeforeTernaryOperators: true -+BreakConstructorInitializersBeforeComma: true -+BinPackParameters: 
false -+BinPackArguments: false -+ColumnLimit: 78 -+ConstructorInitializerAllOnOneLineOrOnePerLine: false -+ConstructorInitializerIndentWidth: 3 -+DerivePointerAlignment: false -+ExperimentalAutoDetectBinPacking: false -+IndentCaseLabels: false -+IndentWrappedFunctionNames: false -+IndentFunctionDeclarationAfterType: false -+MaxEmptyLinesToKeep: 2 -+KeepEmptyLinesAtTheStartOfBlocks: true -+NamespaceIndentation: Inner -+ObjCBlockIndentWidth: 3 -+ObjCSpaceAfterProperty: true -+ObjCSpaceBeforeProtocolList: true -+PenaltyBreakBeforeFirstCallParameter: 19 -+PenaltyBreakComment: 300 -+PenaltyBreakString: 1000 -+PenaltyBreakFirstLessLess: 120 -+PenaltyExcessCharacter: 1000000 -+PenaltyReturnTypeOnItsOwnLine: 0 -+PointerAlignment: Right -+SpacesBeforeTrailingComments: 1 -+Cpp11BracedListStyle: true -+Standard: Cpp11 -+IndentWidth: 3 -+TabWidth: 8 -+UseTab: Never -+BreakBeforeBraces: Linux -+SpacesInParentheses: false -+SpacesInSquareBrackets: false -+SpacesInAngles: false -+SpaceInEmptyParentheses: false -+SpacesInCStyleCastParentheses: false -+SpaceAfterCStyleCast: false -+SpacesInContainerLiterals: true -+SpaceBeforeAssignmentOperators: true -+ContinuationIndentWidth: 3 -+CommentPragmas: '^ IWYU pragma:' -+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -+SpaceBeforeParens: ControlStatements -+DisableFormat: false -+... -+ -diff --git a/src/gallium/drivers/swr/Automake.inc b/src/gallium/drivers/swr/Automake.inc -new file mode 100644 -index 0000000..8e66744 ---- /dev/null -+++ b/src/gallium/drivers/swr/Automake.inc -@@ -0,0 +1,28 @@ -+# Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+if HAVE_GALLIUM_SWR -+ -+TARGET_CPPFLAGS += -DGALLIUM_SWR -+TARGET_LIB_DEPS += \ -+ $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la -+ -+endif -diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am -new file mode 100644 -index 0000000..5dff02c ---- /dev/null -+++ b/src/gallium/drivers/swr/Makefile.am -@@ -0,0 +1,82 @@ -+# Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+AUTOMAKE_OPTIONS = subdir-objects -+ -+include Makefile.sources -+include $(top_srcdir)/src/gallium/Automake.inc -+ -+AM_CXXFLAGS = \ -+ $(GALLIUM_DRIVER_CFLAGS) \ -+ -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS \ -+ $(SWR_ARCH_FLAG) \ -+ $(LLVM_CFLAGS) -+ -+noinst_LTLIBRARIES = libmesaswr.la -+ -+libmesaswr_la_SOURCES = $(CXX_SOURCES) -+ -+libmesaswr_la_LDFLAGS = -+ -+if SWR_NATIVE -+BUILT_SOURCES = \ -+ rasterizer/scripts/gen_knobs.cpp \ -+ rasterizer/scripts/gen_knobs.h \ -+ rasterizer/jitter/state_llvm.h -+ -+rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template -+ $(PYTHON2) $(PYTHON_FLAGS) \ -+ $(srcdir)/rasterizer/scripts/gen_knobs.py \ -+ rasterizer/scripts -+ -+rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h -+ $(PYTHON2) $(PYTHON_FLAGS) \ -+ $(srcdir)/rasterizer/jitter/scripts/gen_llvm_types.py \ -+ --input $(srcdir)/rasterizer/core/state.h \ -+ --output rasterizer/jitter/state_llvm.h -+ -+libmesaswr_la_SOURCES += \ -+ $(COMMON_CXX_SOURCES) \ -+ $(CORE_CXX_SOURCES) \ -+ $(JITTER_CXX_SOURCES) \ -+ $(MEMORY_CXX_SOURCES) \ -+ rasterizer/scripts/gen_knobs.cpp \ -+ rasterizer/scripts/gen_knobs.h -+AM_CXXFLAGS += \ -+ -I$(srcdir)/rasterizer \ -+ -I$(srcdir)/rasterizer/core \ -+ -I$(srcdir)/rasterizer/jitter \ -+ -I$(builddir)/rasterizer/scripts \ -+ -I$(builddir)/rasterizer/jitter -+else -+libmesaswr_la_LDFLAGS += -L$(SWR_LIBDIR) -lSWR -+AM_CXXFLAGS += \ -+ -I$(SWR_INCLUDEDIR) \ -+ -I$(SWR_INCLUDEDIR)/core \ -+ -I$(SWR_INCLUDEDIR)/jitter \ -+ -I$(SWR_INCLUDEDIR)/build/jitter \ -+ -I$(SWR_INCLUDEDIR)/build/scripts -+endif -+ -+libmesaswr_la_LDFLAGS += -lnuma -+ -+ -+EXTRA_DIST = SConscript -diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources -new file mode 100644 -index 0000000..1c6fe08 ---- /dev/null -+++ b/src/gallium/drivers/swr/Makefile.sources -@@ -0,0 +1,114 @@ -+# Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+CXX_SOURCES := \ -+ swr_clear.cpp \ -+ swr_context.cpp \ -+ swr_context.h \ -+ swr_context_llvm.h \ -+ swr_draw.cpp \ -+ swr_public.h \ -+ swr_resource.h \ -+ swr_screen.cpp \ -+ swr_screen.h \ -+ swr_state.cpp \ -+ swr_state.h \ -+ swr_tex_sample.cpp \ -+ swr_tex_sample.h \ -+ swr_scratch.h \ -+ swr_scratch.cpp \ -+ swr_shader.cpp \ -+ swr_memory.h \ -+ swr_fence.h \ -+ swr_fence.cpp \ -+ swr_query.h \ -+ swr_query.cpp -+ -+COMMON_CXX_SOURCES := \ -+ rasterizer/common/containers.hpp \ -+ rasterizer/common/formats.cpp \ -+ rasterizer/common/formats.h \ -+ rasterizer/common/isa.hpp \ -+ rasterizer/common/os.h \ -+ rasterizer/common/rdtsc_buckets.cpp \ -+ rasterizer/common/rdtsc_buckets.h \ -+ rasterizer/common/rdtsc_buckets_shared.h \ -+ rasterizer/common/rdtsc_buckets_shared.h \ -+ rasterizer/common/simdintrin.h \ -+ rasterizer/common/swr_assert.cpp \ -+ rasterizer/common/swr_assert.h -+ -+CORE_CXX_SOURCES := \ -+ rasterizer/core/api.cpp \ -+ rasterizer/core/api.h \ -+ rasterizer/core/arena.cpp \ -+ rasterizer/core/arena.h \ -+ rasterizer/core/backend.cpp \ -+ rasterizer/core/backend.h \ -+ rasterizer/core/blend.h \ -+ rasterizer/core/clip.cpp \ -+ rasterizer/core/clip.h \ -+ rasterizer/core/context.h \ -+ rasterizer/core/depthstencil.h \ -+ rasterizer/core/fifo.hpp \ -+ rasterizer/core/format_traits.h \ -+ rasterizer/core/format_types.h \ -+ rasterizer/core/frontend.cpp \ -+ rasterizer/core/frontend.h \ -+ rasterizer/core/knobs.h \ -+ rasterizer/core/knobs_init.h \ -+ rasterizer/core/multisample.h \ -+ rasterizer/core/pa_avx.cpp \ -+ rasterizer/core/pa.h \ -+ rasterizer/core/rasterizer.cpp \ -+ rasterizer/core/rasterizer.h \ -+ rasterizer/core/rdtsc_core.cpp \ -+ rasterizer/core/rdtsc_core.h \ -+ rasterizer/core/state.h \ -+ rasterizer/core/threads.cpp \ -+ rasterizer/core/threads.h \ -+ rasterizer/core/tilemgr.cpp \ -+ rasterizer/core/tilemgr.h \ -+ rasterizer/core/utils.cpp \ -+ rasterizer/core/utils.h -+ -+JITTER_CXX_SOURCES := \ -+ rasterizer/jitter/blend_jit.cpp \ -+ rasterizer/jitter/blend_jit.h \ -+ rasterizer/jitter/builder.cpp \ -+ rasterizer/jitter/builder_gen.cpp \ -+ rasterizer/jitter/builder_gen.h \ -+ rasterizer/jitter/builder.h \ -+ rasterizer/jitter/builder_misc.cpp \ -+ rasterizer/jitter/builder_misc.h \ -+ rasterizer/jitter/builder_x86.cpp \ -+ rasterizer/jitter/builder_x86.h \ -+ rasterizer/jitter/fetch_jit.cpp \ -+ rasterizer/jitter/fetch_jit.h \ -+ 
rasterizer/jitter/JitManager.cpp \ -+ rasterizer/jitter/JitManager.h \ -+ rasterizer/jitter/streamout_jit.cpp \ -+ rasterizer/jitter/streamout_jit.h -+ -+MEMORY_CXX_SOURCES := \ -+ rasterizer/memory/ClearTile.cpp \ -+ rasterizer/memory/LoadTile.cpp \ -+ rasterizer/memory/StoreTile.cpp -diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript -new file mode 100644 -index 0000000..4c8c121 ---- /dev/null -+++ b/src/gallium/drivers/swr/SConscript -@@ -0,0 +1,69 @@ -+from sys import executable as python_cmd -+import distutils.version -+ -+Import('*') -+ -+if not env['llvm']: -+ print 'warning: LLVM disabled: not building swr' -+ Return() -+ -+env = env.Clone() -+ -+env.MSVC2008Compat() -+ -+env.Append(CPPDEFINES = [ -+ '__STDC_CONSTANT_MACROS', -+ '__STDC_LIMIT_MACROS', -+ 'KNOB_ARCH=KNOB_ARCH_AVX2', -+ ]) -+ -+env.Append(CCFLAGS = [ -+ '-std=c++11', -+ '-march=core-avx2', -+ ]) -+ -+env.Prepend(CPPPATH = [ -+ 'rasterizer', -+ 'rasterizer/core', -+ 'rasterizer/jitter', -+ 'rasterizer/scripts', -+ ]) -+ -+gen_knobs = env.CodeGenerate( -+ target = 'rasterizer/scripts/gen_knobs.cpp', -+ script = 'rasterizer/scripts/gen_knobs.py', -+ source = [], -+ command = python_cmd + ' $SCRIPT ' + Dir('rasterizer/scripts').abspath -+) -+ -+gen_knobs = env.CodeGenerate( -+ target = 'rasterizer/scripts/gen_knobs.h', -+ script = 'rasterizer/scripts/gen_knobs.py', -+ source = [], -+ command = python_cmd + ' $SCRIPT ' + Dir('rasterizer/scripts').abspath -+) -+ -+state_llvm = env.CodeGenerate( -+ target = 'rasterizer/jitter/state_llvm.h', -+ script = 'rasterizer/jitter/scripts/gen_llvm_types.py', -+ source = 'rasterizer/core/state.h', -+ command = python_cmd + ' $SCRIPT --input $SOURCE --output $TARGET' -+) -+ -+source = ['rasterizer/scripts/gen_knobs.cpp', 'rasterizer/scripts/gen_knobs.h'] -+source += env.ParseSourceList('Makefile.sources', [ -+ 'CXX_SOURCES', -+ 'COMMON_CXX_SOURCES', -+ 'CORE_CXX_SOURCES', -+ 'JITTER_CXX_SOURCES', -+ 'MEMORY_CXX_SOURCES' -+]) -+ -+swr = env.ConvenienceLibrary( -+ target = 'swr', -+ source = source, -+ ) -+ -+env.Alias('swr', swr) -+ -+Export('swr') -diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp -new file mode 100644 -index 0000000..7704359 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_clear.cpp -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "swr_context.h" -+#include "swr_query.h" -+ -+static void -+swr_clear(struct pipe_context *pipe, -+ unsigned buffers, -+ const union pipe_color_union *color, -+ double depth, -+ unsigned stencil) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct pipe_framebuffer_state *fb = &ctx->framebuffer; -+ -+ UINT clearMask = 0; -+ -+ if (!swr_check_render_cond(pipe)) -+ return; -+ -+ if (ctx->dirty) -+ swr_update_derived(ctx); -+ -+/* Update clearMask/targetMask */ -+#if 0 /* XXX SWR currently only clears SWR_ATTACHMENT_COLOR0, don't bother \ -+ checking others yet. */ -+ if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { -+ UINT i; -+ for (i = 0; i < fb->nr_cbufs; ++i) -+ if (fb->cbufs[i]) -+ clearMask |= (SWR_CLEAR_COLOR0 << i); -+ } -+#else -+ if (buffers & PIPE_CLEAR_COLOR && fb->cbufs[0]) -+ clearMask |= SWR_CLEAR_COLOR; -+#endif -+ -+ if (buffers & PIPE_CLEAR_DEPTH && fb->zsbuf) -+ clearMask |= SWR_CLEAR_DEPTH; -+ -+ if (buffers & PIPE_CLEAR_STENCIL && fb->zsbuf) -+ clearMask |= SWR_CLEAR_STENCIL; -+ -+#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are -+ // transparent. -+ ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */ -+#endif -+ -+ /* Reset viewport to full framebuffer width/height before clear, then -+ * restore it */ -+ /* Scissor affects clear, viewport should not */ -+ ctx->dirty |= SWR_NEW_VIEWPORT; -+ SWR_VIEWPORT vp = {0}; -+ vp.width = ctx->framebuffer.width; -+ vp.height = ctx->framebuffer.height; -+ SwrSetViewports(ctx->swrContext, 1, &vp, NULL); -+ -+ SwrClearRenderTarget(ctx->swrContext, clearMask, color->f, depth, stencil); -+} -+ -+ -+#if 0 // XXX, these don't get called. how to get these called? Do we need -+ // them? Docs? 
-+static void -+swr_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps, -+ const union pipe_color_union *color, -+ unsigned x, unsigned y, unsigned w, unsigned h) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ fprintf(stderr, "SWR swr_clear_render_target!\n"); -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; -+} -+ -+static void -+swr_clear_depth_stencil(struct pipe_context *pipe, struct pipe_surface *ps, -+ unsigned buffers, double depth, unsigned stencil, -+ unsigned x, unsigned y, unsigned w, unsigned h) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ fprintf(stderr, "SWR swr_clear_depth_stencil!\n"); -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; -+} -+ -+static void -+swr_clear_buffer(struct pipe_context *pipe, -+ struct pipe_resource *res, -+ unsigned offset, unsigned size, -+ const void *data, int data_size) -+{ -+ fprintf(stderr, "SWR swr_clear_buffer!\n"); -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_resource *buf = swr_resource(res); -+ union pipe_color_union color; -+ enum pipe_format dst_fmt; -+ unsigned width, height, elements; -+ -+ assert(res->target == PIPE_BUFFER); -+ assert(buf); -+ assert(size % data_size == 0); -+ -+ SWR_SURFACE_STATE &swr_buffer = buf->swr; -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; -+} -+#endif -+ -+ -+void -+swr_clear_init(struct pipe_context *pipe) -+{ -+ pipe->clear = swr_clear; -+#if 0 // XXX, these don't get called. how to get these called? Do we need -+ // them? Docs? -+ pipe->clear_render_target = swr_clear_render_target; -+ pipe->clear_depth_stencil = swr_clear_depth_stencil; -+ pipe->clear_buffer = swr_clear_buffer; -+#endif -+} -diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp -new file mode 100644 -index 0000000..6269cd0 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_context.cpp -@@ -0,0 +1,392 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "util/u_memory.h" -+#include "util/u_inlines.h" -+#include "util/u_format.h" -+ -+extern "C" { -+#include "util/u_transfer.h" -+#include "util/u_surface.h" -+} -+ -+#include "swr_context.h" -+#include "swr_memory.h" -+#include "swr_screen.h" -+#include "swr_resource.h" -+#include "swr_scratch.h" -+#include "swr_query.h" -+ -+#include "api.h" -+ -+static struct pipe_surface * -+swr_create_surface(struct pipe_context *pipe, -+ struct pipe_resource *pt, -+ const struct pipe_surface *surf_tmpl) -+{ -+ struct pipe_surface *ps; -+ -+ ps = CALLOC_STRUCT(pipe_surface); -+ if (ps) { -+ pipe_reference_init(&ps->reference, 1); -+ pipe_resource_reference(&ps->texture, pt); -+ ps->context = pipe; -+ ps->format = surf_tmpl->format; -+ if (pt->target != PIPE_BUFFER) { -+ assert(surf_tmpl->u.tex.level <= pt->last_level); -+ ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); -+ ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); -+ ps->u.tex.level = surf_tmpl->u.tex.level; -+ ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; -+ ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; -+ if (ps->u.tex.first_layer != ps->u.tex.last_layer) { -+ debug_printf("creating surface with multiple layers, rendering " -+ "to first layer only\n"); -+ } -+ } else { -+ /* setting width as number of elements should get us correct -+ * renderbuffer width */ -+ ps->width = surf_tmpl->u.buf.last_element -+ - surf_tmpl->u.buf.first_element + 1; -+ ps->height = pt->height0; -+ ps->u.buf.first_element = surf_tmpl->u.buf.first_element; -+ ps->u.buf.last_element = surf_tmpl->u.buf.last_element; -+ assert(ps->u.buf.first_element <= ps->u.buf.last_element); -+ assert(ps->u.buf.last_element < ps->width); -+ } -+ } -+ return ps; -+} -+ -+static void -+swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf) -+{ -+ assert(surf->texture); -+ struct pipe_resource *resource = surf->texture; -+ -+ /* If the surface being destroyed is a current render target, -+ * call StoreTiles to resolve the hotTile state then set attachment -+ * to NULL. -+ */ -+ if (resource->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL -+ | PIPE_BIND_DISPLAY_TARGET)) { -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_resource *spr = swr_resource(resource); -+ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) -+ if (ctx->current.attachment[i] == &spr->swr) { -+ swr_store_render_target(ctx, i, SWR_TILE_RESOLVED); -+ ctx->current.attachment[i] = nullptr; -+ /* -+ * Mesa thinks depth/stencil are fused, so we'll never get an -+ * explicit resource for stencil. So, if checking depth, then -+ * also -+ * check for stencil. 
-+ */ -+ if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) { -+ swr_store_render_target( -+ ctx, SWR_ATTACHMENT_STENCIL, SWR_TILE_RESOLVED); -+ ctx->current.attachment[SWR_ATTACHMENT_STENCIL] = nullptr; -+ } -+ -+ SwrWaitForIdle(ctx->swrContext); -+ break; -+ } -+ } -+ -+ pipe_resource_reference(&surf->texture, NULL); -+ FREE(surf); -+} -+ -+ -+static void * -+swr_transfer_map(struct pipe_context *pipe, -+ struct pipe_resource *resource, -+ unsigned level, -+ unsigned usage, -+ const struct pipe_box *box, -+ struct pipe_transfer **transfer) -+{ -+ struct swr_resource *spr = swr_resource(resource); -+ struct pipe_transfer *pt; -+ enum pipe_format format = resource->format; -+ -+ assert(resource); -+ assert(level <= resource->last_level); -+ -+ /* -+ * If mapping any attached rendertarget, store tiles and wait for idle -+ * before giving CPU access to the surface. -+ * (set postStoreTileState to SWR_TILE_INVALID so tiles are reloaded) -+ */ -+ if (resource->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL -+ | PIPE_BIND_DISPLAY_TARGET)) { -+ struct swr_context *ctx = swr_context(pipe); -+ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) -+ if (ctx->current.attachment[i] == &spr->swr) { -+ swr_store_render_target(ctx, i, SWR_TILE_INVALID); -+ /* -+ * Mesa thinks depth/stencil are fused, so we'll never get an -+ * explicit map for stencil. So, if mapping depth, then also -+ * store tile for stencil. -+ */ -+ if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) -+ swr_store_render_target( -+ ctx, SWR_ATTACHMENT_STENCIL, SWR_TILE_INVALID); -+ SwrWaitForIdle(ctx->swrContext); -+ break; -+ } -+ } -+ -+ -+ pt = CALLOC_STRUCT(pipe_transfer); -+ if (!pt) -+ return NULL; -+ pipe_resource_reference(&pt->resource, resource); -+ pt->level = level; -+ pt->box = *box; -+ pt->stride = spr->row_stride[level]; -+ pt->layer_stride = spr->img_stride[level]; -+ -+ /* if we're mapping the depth/stencil, copy in stencil */ -+ if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT -+ && spr->has_stencil) { -+ for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) { -+ spr->swr.pBaseAddress[4 * i + 3] = spr->secondary.pBaseAddress[i]; -+ } -+ } -+ -+ unsigned offset = box->z * pt->layer_stride + box->y * pt->stride -+ + box->x * util_format_get_blocksize(format); -+ -+ *transfer = pt; -+ -+ return spr->swr.pBaseAddress + offset + spr->mip_offsets[level]; -+} -+ -+static void -+swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer) -+{ -+ assert(transfer->resource); -+ -+ /* -+ * XXX TODO: use fences and come up with a real resource manager. -+ * -+ * If this resource has been mapped/unmapped, it's probably in use. Tag it -+ *with this context so -+ * we'll know to check dependencies when it's deleted. 
-+ */ -+ struct swr_resource *res = swr_resource(transfer->resource); -+ res->bound_to_context = (void *)pipe; -+ -+ /* if we're mapping the depth/stencil, copy out stencil */ -+ if (res->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT -+ && res->has_stencil) { -+ for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) { -+ res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[4 * i + 3]; -+ } -+ } -+ -+ pipe_resource_reference(&transfer->resource, NULL); -+ FREE(transfer); -+} -+ -+ -+static void -+swr_resource_copy(struct pipe_context *pipe, -+ struct pipe_resource *dst, -+ unsigned dst_level, -+ unsigned dstx, -+ unsigned dsty, -+ unsigned dstz, -+ struct pipe_resource *src, -+ unsigned src_level, -+ const struct pipe_box *src_box) -+{ -+ if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) -+ || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) { -+ util_resource_copy_region( -+ pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); -+ return; -+ } -+ -+ debug_printf("unhandled swr_resource_copy\n"); -+} -+ -+ -+static void -+swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct pipe_blit_info info = *blit_info; -+ -+ if (blit_info->render_condition_enable && !swr_check_render_cond(pipe)) -+ return; -+ -+ if (info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1 -+ && !util_format_is_depth_or_stencil(info.src.resource->format) -+ && !util_format_is_pure_integer(info.src.resource->format)) { -+ debug_printf("swr: color resolve unimplemented\n"); -+ return; -+ } -+ -+ if (util_try_blit_via_copy_region(pipe, &info)) { -+ return; /* done */ -+ } -+ -+ if (info.mask & PIPE_MASK_S) { -+ debug_printf("swr: cannot blit stencil, skipping\n"); -+ info.mask &= ~PIPE_MASK_S; -+ } -+ -+ if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { -+ debug_printf("swr: blit unsupported %s -> %s\n", -+ util_format_short_name(info.src.resource->format), -+ util_format_short_name(info.dst.resource->format)); -+ return; -+ } -+ -+ /* XXX turn off occlusion and streamout queries */ -+ -+ util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer); -+ util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems); -+ util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs); -+ /*util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs);*/ -+ util_blitter_save_so_targets( -+ ctx->blitter, -+ ctx->num_so_targets, -+ (struct pipe_stream_output_target **)ctx->so_targets); -+ util_blitter_save_rasterizer(ctx->blitter, (void *)ctx->rasterizer); -+ util_blitter_save_viewport(ctx->blitter, &ctx->viewport); -+ util_blitter_save_scissor(ctx->blitter, &ctx->scissor); -+ util_blitter_save_fragment_shader(ctx->blitter, ctx->fs); -+ util_blitter_save_blend(ctx->blitter, (void *)ctx->blend); -+ util_blitter_save_depth_stencil_alpha(ctx->blitter, -+ (void *)ctx->depth_stencil); -+ util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); -+ util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask); -+ util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); -+ util_blitter_save_fragment_sampler_states( -+ ctx->blitter, -+ ctx->num_samplers[PIPE_SHADER_FRAGMENT], -+ (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]); -+ util_blitter_save_fragment_sampler_views( -+ ctx->blitter, -+ ctx->num_sampler_views[PIPE_SHADER_FRAGMENT], -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT]); -+ util_blitter_save_render_condition(ctx->blitter, -+ 
ctx->render_cond_query, -+ ctx->render_cond_cond, -+ ctx->render_cond_mode); -+ -+ util_blitter_blit(ctx->blitter, &info); -+} -+ -+ -+static void -+swr_destroy(struct pipe_context *pipe) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->blitter) -+ util_blitter_destroy(ctx->blitter); -+ -+ if (ctx->swrContext) -+ SwrDestroyContext(ctx->swrContext); -+ -+ delete ctx->blendJIT; -+ -+ swr_destroy_scratch_buffers(ctx); -+ -+ FREE(ctx); -+} -+ -+ -+static void -+swr_render_condition(struct pipe_context *pipe, -+ struct pipe_query *query, -+ boolean condition, -+ uint mode) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->render_cond_query = query; -+ ctx->render_cond_mode = mode; -+ ctx->render_cond_cond = condition; -+} -+ -+ -+struct pipe_context * -+swr_create_context(struct pipe_screen *screen, void *priv) -+{ -+ struct swr_context *ctx = CALLOC_STRUCT(swr_context); -+ ctx->blendJIT = -+ new std::unordered_map; -+ -+ SWR_CREATECONTEXT_INFO createInfo; -+ createInfo.driver = GL; -+ createInfo.privateStateSize = sizeof(swr_draw_context); -+ createInfo.pfnLoadTile = swr_LoadHotTile; -+ createInfo.pfnStoreTile = swr_StoreHotTile; -+ createInfo.pfnClearTile = swr_StoreHotTileClear; -+ ctx->swrContext = SwrCreateContext(&createInfo); -+ -+ /* Init Load/Store/ClearTiles Tables */ -+ swr_InitMemoryModule(); -+ -+ if (ctx->swrContext == NULL) -+ goto fail; -+ -+ ctx->pipe.screen = screen; -+ ctx->pipe.destroy = swr_destroy; -+ ctx->pipe.priv = priv; -+ ctx->pipe.create_surface = swr_create_surface; -+ ctx->pipe.surface_destroy = swr_surface_destroy; -+ ctx->pipe.transfer_map = swr_transfer_map; -+ ctx->pipe.transfer_unmap = swr_transfer_unmap; -+ -+ ctx->pipe.transfer_flush_region = u_default_transfer_flush_region; -+ ctx->pipe.transfer_inline_write = u_default_transfer_inline_write; -+ -+ ctx->pipe.resource_copy_region = swr_resource_copy; -+ ctx->pipe.render_condition = swr_render_condition; -+ -+ swr_state_init(&ctx->pipe); -+ swr_clear_init(&ctx->pipe); -+ swr_draw_init(&ctx->pipe); -+ swr_query_init(&ctx->pipe); -+ -+ ctx->pipe.blit = swr_blit; -+ ctx->blitter = util_blitter_create(&ctx->pipe); -+ if (!ctx->blitter) { -+ goto fail; -+ } -+ -+ swr_init_scratch_buffers(ctx); -+ -+ return &ctx->pipe; -+ -+fail: -+ /* Should really validate the init steps and fail gracefully */ -+ swr_destroy(&ctx->pipe); -+ return NULL; -+} -diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h -new file mode 100644 -index 0000000..9d93a6d ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_context.h -@@ -0,0 +1,172 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. 
-+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_CONTEXT_H -+#define SWR_CONTEXT_H -+ -+#include "pipe/p_context.h" -+#include "pipe/p_state.h" -+#include "util/u_blitter.h" -+#include "jit_api.h" -+#include "swr_state.h" -+#include -+ -+#define SWR_NEW_BLEND (1 << 0) -+#define SWR_NEW_RASTERIZER (1 << 1) -+#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2) -+#define SWR_NEW_SAMPLER (1 << 3) -+#define SWR_NEW_SAMPLER_VIEW (1 << 4) -+#define SWR_NEW_VS (1 << 5) -+#define SWR_NEW_FS (1 << 6) -+#define SWR_NEW_VSCONSTANTS (1 << 7) -+#define SWR_NEW_FSCONSTANTS (1 << 8) -+#define SWR_NEW_VERTEX (1 << 9) -+#define SWR_NEW_STIPPLE (1 << 10) -+#define SWR_NEW_SCISSOR (1 << 11) -+#define SWR_NEW_VIEWPORT (1 << 12) -+#define SWR_NEW_FRAMEBUFFER (1 << 13) -+#define SWR_NEW_CLIP (1 << 14) -+#define SWR_NEW_SO (1 << 15) -+#define SWR_NEW_ALL 0x0000ffff -+ -+namespace std -+{ -+template <> struct hash { -+ std::size_t operator()(const BLEND_COMPILE_STATE &k) const -+ { -+ return util_hash_crc32(&k, sizeof(k)); -+ } -+}; -+}; -+ -+struct swr_context { -+ struct pipe_context pipe; /**< base class */ -+ -+ HANDLE swrContext; -+ -+ /** Constant state objects */ -+ struct swr_blend_state *blend; -+ struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; -+ struct pipe_depth_stencil_alpha_state *depth_stencil; -+ struct pipe_rasterizer_state *rasterizer; -+ -+ struct swr_vertex_shader *vs; -+ struct swr_fragment_shader *fs; -+ struct swr_vertex_element_state *velems; -+ -+ /** Other rendering state */ -+ struct pipe_blend_color blend_color; -+ struct pipe_stencil_ref stencil_ref; -+ struct pipe_clip_state clip; -+ struct pipe_constant_buffer -+ constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; -+ struct pipe_framebuffer_state framebuffer; -+ struct pipe_poly_stipple poly_stipple; -+ struct pipe_scissor_state scissor; -+ struct pipe_sampler_view * -+ sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ -+ struct pipe_viewport_state viewport; -+ struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; -+ struct pipe_index_buffer index_buffer; -+ -+ struct blitter_context *blitter; -+ -+ /** Conditional query object and mode */ -+ struct pipe_query *render_cond_query; -+ uint render_cond_mode; -+ boolean render_cond_cond; -+ unsigned active_queries; -+ -+ unsigned num_vertex_buffers; -+ unsigned num_samplers[PIPE_SHADER_TYPES]; -+ unsigned num_sampler_views[PIPE_SHADER_TYPES]; -+ -+ unsigned sample_mask; -+ -+ // streamout -+ pipe_stream_output_target *so_targets[MAX_SO_STREAMS]; -+ uint32_t num_so_targets; -+ -+ /* Temp storage for user_buffer constants */ -+ struct swr_scratch_buffers *scratch; -+ -+ // blend jit functions -+ std::unordered_map *blendJIT; -+ -+ /* Shadows of current SWR API DrawState */ -+ struct swr_shadow_state current; -+ -+ unsigned dirty; /**< Mask of SWR_NEW_x flags */ -+}; -+ -+struct swr_jit_texture { -+ uint32_t width; // same as number of elements -+ uint32_t height; -+ uint32_t depth; // doubles as array 
size -+ uint32_t first_level; -+ uint32_t last_level; -+ const void *base_ptr; -+ uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; -+}; -+ -+struct swr_jit_sampler { -+ float min_lod; -+ float max_lod; -+ float lod_bias; -+ float border_color[4]; -+}; -+ -+struct swr_draw_context { -+ const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS]; -+ unsigned num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS]; -+ const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS]; -+ unsigned num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS]; -+ -+ swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS]; -+ swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS]; -+ -+ SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS]; -+}; -+ -+ -+static INLINE struct swr_context * -+swr_context(struct pipe_context *pipe) -+{ -+ return (struct swr_context *)pipe; -+} -+ -+struct pipe_context *swr_create_context(struct pipe_screen *, void *priv); -+ -+void swr_state_init(struct pipe_context *pipe); -+ -+void swr_clear_init(struct pipe_context *pipe); -+ -+void swr_draw_init(struct pipe_context *pipe); -+ -+void swr_finish(struct pipe_context *pipe); -+#endif -diff --git a/src/gallium/drivers/swr/swr_context_llvm.h b/src/gallium/drivers/swr/swr_context_llvm.h -new file mode 100644 -index 0000000..58da813 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_context_llvm.h -@@ -0,0 +1,124 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// Generate LLVM type information for swr_jit_texture -+INLINE static StructType * -+Gen_swr_jit_texture(JitManager *pShG) -+{ -+ LLVMContext &ctx = pShG->mContext; -+ std::vector members; -+ -+ members.push_back(Type::getInt32Ty(ctx)); // width -+ members.push_back(Type::getInt32Ty(ctx)); // height -+ members.push_back(Type::getInt32Ty(ctx)); // depth -+ members.push_back(Type::getInt32Ty(ctx)); // first_level -+ members.push_back(Type::getInt32Ty(ctx)); // last_level -+ members.push_back(PointerType::get(Type::getInt8Ty(ctx), 0)); // base_ptr -+ members.push_back(ArrayType::get(Type::getInt32Ty(ctx), -+ PIPE_MAX_TEXTURE_LEVELS)); // row_stride -+ members.push_back(ArrayType::get(Type::getInt32Ty(ctx), -+ PIPE_MAX_TEXTURE_LEVELS)); // img_stride -+ members.push_back(ArrayType::get(Type::getInt32Ty(ctx), -+ PIPE_MAX_TEXTURE_LEVELS)); // mip_offsets -+ -+ return StructType::get(ctx, members, false); -+} -+ -+static const UINT swr_jit_texture_width = 0; -+static const UINT swr_jit_texture_height = 1; -+static const UINT swr_jit_texture_depth = 2; -+static const UINT swr_jit_texture_first_level = 3; -+static const UINT swr_jit_texture_last_level = 4; -+static const UINT swr_jit_texture_base_ptr = 5; -+static const UINT swr_jit_texture_row_stride = 6; -+static const UINT swr_jit_texture_img_stride = 7; -+static const UINT swr_jit_texture_mip_offsets = 8; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Generate LLVM type information for swr_jit_sampler -+INLINE static StructType * -+Gen_swr_jit_sampler(JitManager *pShG) -+{ -+ LLVMContext &ctx = pShG->mContext; -+ std::vector members; -+ -+ members.push_back(Type::getFloatTy(ctx)); // min_lod -+ members.push_back(Type::getFloatTy(ctx)); // max_lod -+ members.push_back(Type::getFloatTy(ctx)); // lod_bias -+ members.push_back( -+ ArrayType::get(Type::getFloatTy(ctx), 4)); // border_color -+ -+ return StructType::get(ctx, members, false); -+} -+ -+static const UINT swr_jit_sampler_min_lod = 0; -+static const UINT swr_jit_sampler_max_lod = 1; -+static const UINT swr_jit_sampler_lod_bias = 2; -+static const UINT swr_jit_sampler_border_color = 3; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Generate LLVM type information for swr_draw_context -+INLINE static StructType * -+Gen_swr_draw_context(JitManager *pShG) -+{ -+ LLVMContext &ctx = pShG->mContext; -+ std::vector members; -+ -+ members.push_back( -+ ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), -+ PIPE_MAX_CONSTANT_BUFFERS)); // constantVS -+ members.push_back(ArrayType::get( -+ Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsVS -+ members.push_back( -+ ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), -+ PIPE_MAX_CONSTANT_BUFFERS)); // constantFS -+ members.push_back(ArrayType::get( -+ Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsFS -+ members.push_back( -+ ArrayType::get(Gen_swr_jit_texture(pShG), -+ PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesVS -+ members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), -+ PIPE_MAX_SAMPLERS)); // samplersVS -+ members.push_back( -+ ArrayType::get(Gen_swr_jit_texture(pShG), -+ PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesFS -+ members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), -+ PIPE_MAX_SAMPLERS)); // samplersFS -+ 
members.push_back(ArrayType::get(Gen_SWR_SURFACE_STATE(pShG), -+ SWR_NUM_ATTACHMENTS)); // renderTargets -+ -+ return StructType::get(ctx, members, false); -+} -+ -+static const UINT swr_draw_context_constantVS = 0; -+static const UINT swr_draw_context_num_constantsVS = 1; -+static const UINT swr_draw_context_constantFS = 2; -+static const UINT swr_draw_context_num_constantsFS = 3; -+static const UINT swr_draw_context_texturesVS = 4; -+static const UINT swr_draw_context_samplersVS = 5; -+static const UINT swr_draw_context_texturesFS = 6; -+static const UINT swr_draw_context_samplersFS = 7; -+static const UINT swr_draw_context_renderTargets = 8; -diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp -new file mode 100644 -index 0000000..797ebdc ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_draw.cpp -@@ -0,0 +1,277 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "swr_screen.h" -+#include "swr_context.h" -+#include "swr_resource.h" -+#include "swr_fence.h" -+#include "swr_query.h" -+#include "jit_api.h" -+ -+#include "util/u_draw.h" -+#include "util/u_prim.h" -+ -+/* -+ * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY -+ */ -+static INLINE enum PRIMITIVE_TOPOLOGY -+swr_convert_prim_topology(const unsigned mode) -+{ -+ switch (mode) { -+ case PIPE_PRIM_POINTS: -+ return TOP_POINT_LIST; -+ case PIPE_PRIM_LINES: -+ return TOP_LINE_LIST; -+ case PIPE_PRIM_LINE_LOOP: -+ return TOP_LINE_LOOP; -+ case PIPE_PRIM_LINE_STRIP: -+ return TOP_LINE_STRIP; -+ case PIPE_PRIM_TRIANGLES: -+ return TOP_TRIANGLE_LIST; -+ case PIPE_PRIM_TRIANGLE_STRIP: -+ return TOP_TRIANGLE_STRIP; -+ case PIPE_PRIM_TRIANGLE_FAN: -+ return TOP_TRIANGLE_FAN; -+ case PIPE_PRIM_QUADS: -+ return TOP_QUAD_LIST; -+ case PIPE_PRIM_QUAD_STRIP: -+ return TOP_QUAD_STRIP; -+ case PIPE_PRIM_POLYGON: -+ return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */ -+ case PIPE_PRIM_LINES_ADJACENCY: -+ return TOP_LINE_LIST_ADJ; -+ case PIPE_PRIM_LINE_STRIP_ADJACENCY: -+ return TOP_LISTSTRIP_ADJ; -+ case PIPE_PRIM_TRIANGLES_ADJACENCY: -+ return TOP_TRI_LIST_ADJ; -+ case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: -+ return TOP_TRI_STRIP_ADJ; -+ default: -+ assert(0 && "Unknown topology"); -+ return TOP_UNKNOWN; -+ } -+}; -+ -+ -+/* -+ * Draw vertex arrays, with optional indexing, optional instancing. -+ */ -+static void -+swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (!swr_check_render_cond(pipe)) -+ return; -+ -+ if (info->indirect) { -+ util_draw_indirect(pipe, info); -+ return; -+ } -+ -+ /* Update derived state, pass draw info to update function */ -+ if (ctx->dirty) -+ swr_update_derived(ctx, info); -+ -+ if (ctx->vs->pipe.stream_output.num_outputs) { -+ if (!ctx->vs->soFunc[info->mode]) { -+ STREAMOUT_COMPILE_STATE state = {0}; -+ struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output; -+ -+ state.numVertsPerPrim = u_vertices_per_prim(info->mode); -+ -+ uint32_t offsets[MAX_SO_STREAMS] = {0}; -+ uint32_t num = 0; -+ -+ for (uint32_t i = 0; i < so->num_outputs; i++) { -+ assert(so->output[i].stream == 0); // @todo -+ uint32_t output_buffer = so->output[i].output_buffer; -+ if (so->output[i].dst_offset != offsets[output_buffer]) { -+ // hole - need to fill -+ state.stream.decl[num].bufferIndex = output_buffer; -+ state.stream.decl[num].hole = true; -+ state.stream.decl[num].componentMask = -+ (1 << (so->output[i].dst_offset - offsets[output_buffer])) -+ - 1; -+ num++; -+ offsets[output_buffer] = so->output[i].dst_offset; -+ } -+ -+ state.stream.decl[num].bufferIndex = output_buffer; -+ state.stream.decl[num].attribSlot = so->output[i].register_index - 1; -+ state.stream.decl[num].componentMask = -+ ((1 << so->output[i].num_components) - 1) -+ << so->output[i].start_component; -+ state.stream.decl[num].hole = false; -+ num++; -+ -+ offsets[output_buffer] += so->output[i].num_components; -+ } -+ -+ state.stream.numDecls = num; -+ -+ HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr; -+ ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state); -+ debug_printf("so shader %p\n", ctx->vs->soFunc[info->mode]); -+ assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL"); -+ } -+ -+ SwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0); -+ } -+ -+ struct swr_vertex_element_state *velems = ctx->velems; 
-+ if (!velems->fsFunc -+ || (velems->fsState.cutIndex != info->restart_index) -+ || (velems->fsState.bEnableCutIndex != info->primitive_restart)) { -+ -+ velems->fsState.cutIndex = info->restart_index; -+ velems->fsState.bEnableCutIndex = info->primitive_restart; -+ -+ /* Create Fetch Shader */ -+ HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; -+ velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState); -+ -+ debug_printf("fetch shader %p\n", velems->fsFunc); -+ assert(velems->fsFunc && "Error: FetchShader = NULL"); -+ } -+ -+ SwrSetFetchFunc(ctx->swrContext, velems->fsFunc); -+ -+ if (info->indexed) -+ SwrDrawIndexedInstanced(ctx->swrContext, -+ swr_convert_prim_topology(info->mode), -+ info->count, -+ info->instance_count, -+ info->start, -+ info->index_bias, -+ info->start_instance); -+ else -+ SwrDrawInstanced(ctx->swrContext, -+ swr_convert_prim_topology(info->mode), -+ info->count, -+ info->instance_count, -+ info->start, -+ info->start_instance); -+} -+ -+ -+static void -+swr_flush(struct pipe_context *pipe, -+ struct pipe_fence_handle **fence, -+ unsigned flags) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_screen *screen = swr_screen(pipe->screen); -+ -+ /* If the current renderTarget is the display surface, store tiles back to -+ * the surface, in -+ * preparation for present (swr_flush_frontbuffer) -+ */ -+ struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; -+ if (cb && swr_resource(cb->texture)->display_target) -+ swr_store_render_target(ctx, SWR_ATTACHMENT_COLOR0, SWR_TILE_RESOLVED); -+ -+ // SwrStoreTiles is asynchronous, always submit the "flush" fence. -+ // flush_frontbuffer needs it. -+ swr_fence_submit(ctx, screen->flush_fence); -+ -+ if (fence) -+ swr_fence_reference(pipe->screen, fence, screen->flush_fence); -+} -+ -+void -+swr_finish(struct pipe_context *pipe) -+{ -+ struct swr_screen *screen = swr_screen(pipe->screen); -+ struct pipe_fence_handle *fence = NULL; -+ -+ swr_flush(pipe, &fence, 0); -+ swr_fence_finish(&screen->base, fence, 0); -+ swr_fence_reference(&screen->base, &fence, NULL); -+} -+ -+ -+/* -+ * Store SWR HotTiles back to RenderTarget surface. -+ */ -+void -+swr_store_render_target(struct swr_context *ctx, -+ uint32_t attachment, -+ enum SWR_TILE_STATE post_tile_state, -+ struct SWR_SURFACE_STATE *surface) -+{ -+ struct swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment]; -+ -+ /* If the passed in surface isn't already attached, it will be attached and -+ * then restored. 
*/ -+ if (surface && (surface != ctx->current.attachment[attachment])) -+ *renderTarget = *surface; -+ -+ /* Only proceed if there's a valid surface to store to */ -+ if (renderTarget->pBaseAddress) { -+ /* Set viewport to full renderTarget width/height and disable scissor -+ * before StoreTiles */ -+ boolean change_viewport = -+ (ctx->current.vp.x != 0.0f || ctx->current.vp.y != 0.0f -+ || ctx->current.vp.width != renderTarget->width -+ || ctx->current.vp.height != renderTarget->height); -+ if (change_viewport) { -+ SWR_VIEWPORT vp = {0}; -+ vp.width = renderTarget->width; -+ vp.height = renderTarget->height; -+ SwrSetViewports(ctx->swrContext, 1, &vp, NULL); -+ } -+ -+ boolean scissor_enable = ctx->current.rastState.scissorEnable; -+ if (scissor_enable) { -+ ctx->current.rastState.scissorEnable = FALSE; -+ SwrSetRastState(ctx->swrContext, &ctx->current.rastState); -+ } -+ -+ SwrStoreTiles(ctx->swrContext, -+ (enum SWR_RENDERTARGET_ATTACHMENT)attachment, -+ post_tile_state); -+ -+ /* Restore viewport and scissor enable */ -+ if (change_viewport) -+ SwrSetViewports(ctx->swrContext, 1, &ctx->current.vp, &ctx->current.vpm); -+ if (scissor_enable) { -+ ctx->current.rastState.scissorEnable = scissor_enable; -+ SwrSetRastState(ctx->swrContext, &ctx->current.rastState); -+ } -+ -+ /* Restore surface attachment, if changed */ -+ if (surface && (surface != ctx->current.attachment[attachment])) -+ *renderTarget = *ctx->current.attachment[attachment]; -+ } -+} -+ -+ -+void -+swr_draw_init(struct pipe_context *pipe) -+{ -+ pipe->draw_vbo = swr_draw_vbo; -+ pipe->flush = swr_flush; -+} -diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp -new file mode 100644 -index 0000000..aaf7223 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_fence.cpp -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "pipe/p_screen.h" -+#include "util/u_memory.h" -+#include "os/os_time.h" -+ -+#include "swr_context.h" -+#include "swr_screen.h" -+#include "swr_fence.h" -+ -+ -+/* -+ * Fence callback, called by back-end thread on completion of all rendering up -+ * to SwrSync call. 
-+ */ -+static void -+swr_sync_cb(UINT64 userData, UINT64 userData2) -+{ -+ struct swr_fence *fence = (struct swr_fence *)userData; -+ -+ fence->read = fence->write; -+} -+ -+/* -+ * Submit an existing fence. -+ */ -+void -+swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh) -+{ -+ struct swr_fence *fence = swr_fence(fh); -+ -+ fence->write++; -+ SwrSync(ctx->swrContext, swr_sync_cb, (UINT64)fence, 0); -+} -+ -+/* -+ * Create a new fence object. -+ */ -+struct pipe_fence_handle * -+swr_fence_create() -+{ -+ static int fence_id = 0; -+ struct swr_fence *fence = CALLOC_STRUCT(swr_fence); -+ if (!fence) -+ return NULL; -+ -+ memset(fence, 0, sizeof(*fence)); -+ pipe_reference_init(&fence->reference, 1); -+ fence->id = fence_id++; -+ -+ return (struct pipe_fence_handle *)fence; -+} -+ -+/** Destroy a fence. Called when refcount hits zero. */ -+static void -+swr_fence_destroy(struct swr_fence *fence) -+{ -+ FREE(fence); -+} -+ -+/** -+ * Set ptr = fence, with reference counting -+ */ -+void -+swr_fence_reference(struct pipe_screen *screen, -+ struct pipe_fence_handle **ptr, -+ struct pipe_fence_handle *f) -+{ -+ struct swr_fence *fence = swr_fence(f); -+ struct swr_fence *old; -+ -+ if (likely(ptr)) { -+ old = swr_fence(*ptr); -+ *ptr = f; -+ } else { -+ old = NULL; -+ } -+ -+ if (pipe_reference(&old->reference, &fence->reference)) -+ swr_fence_destroy(old); -+} -+ -+/* -+ * Wait for the fence to finish. -+ */ -+boolean -+swr_fence_finish(struct pipe_screen *screen, -+ struct pipe_fence_handle *fence_handle, -+ uint64_t timeout) -+{ -+ struct swr_fence *fence = swr_fence(fence_handle); -+ -+ while (!swr_is_fence_done(fence)) -+ sched_yield(); -+ -+ return TRUE; -+} -+ -+ -+uint64_t -+swr_get_timestamp(struct pipe_screen *screen) -+{ -+ return os_time_get_nano(); -+} -+ -+ -+void -+swr_fence_init(struct pipe_screen *p_screen) -+{ -+ p_screen->fence_reference = swr_fence_reference; -+ p_screen->fence_finish = swr_fence_finish; -+ -+ p_screen->get_timestamp = swr_get_timestamp; -+ -+ /* -+ * Create persistent "flush" fence, submitted when swr_flush is called. -+ */ -+ struct swr_screen *screen = swr_screen(p_screen); -+ screen->flush_fence = swr_fence_create(); -+} -diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h -new file mode 100644 -index 0000000..317d74c ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_fence.h -@@ -0,0 +1,73 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included -+ * in all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_FENCE_H -+#define SWR_FENCE_H -+ -+ -+#include "os/os_thread.h" -+#include "pipe/p_state.h" -+#include "util/u_inlines.h" -+ -+ -+struct pipe_screen; -+ -+struct swr_fence { -+ struct pipe_reference reference; -+ -+ uint64_t read; -+ uint64_t write; -+ -+ unsigned id; /* Just for reference */ -+}; -+ -+ -+static inline struct swr_fence * -+swr_fence(struct pipe_fence_handle *fence) -+{ -+ return (struct swr_fence *)fence; -+} -+ -+static INLINE boolean -+swr_is_fence_done(struct swr_fence *fence) -+{ -+ return (fence->read == fence->write); -+} -+ -+ -+void swr_fence_init(struct pipe_screen *screen); -+ -+struct pipe_fence_handle *swr_fence_create(); -+ -+void swr_fence_reference(struct pipe_screen *screen, -+ struct pipe_fence_handle **ptr, -+ struct pipe_fence_handle *f); -+ -+boolean swr_fence_finish(struct pipe_screen *screen, -+ struct pipe_fence_handle *fence_handle, -+ uint64_t timeout); -+ -+void -+swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence); -+ -+uint64_t swr_get_timestamp(struct pipe_screen *screen); -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h -new file mode 100644 -index 0000000..d116781 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_memory.h -@@ -0,0 +1,99 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#pragma once -+ -+void LoadHotTile( -+ SWR_SURFACE_STATE *pSrcSurface, -+ SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, uint32_t renderTargetArrayIndex, -+ BYTE *pDstHotTile); -+ -+void StoreHotTile( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, uint32_t renderTargetArrayIndex, -+ BYTE *pSrcHotTile); -+ -+void StoreHotTileClear( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, -+ UINT y, -+ const float* pClearColor); -+ -+INLINE void -+swr_LoadHotTile(HANDLE hPrivateContext, -+ SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, -+ uint32_t renderTargetArrayIndex, BYTE* pDstHotTile) -+{ -+ // Grab source surface state from private context -+ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; -+ SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex]; -+ -+ LoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile); -+} -+ -+INLINE void -+swr_StoreHotTile(HANDLE hPrivateContext, -+ SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, -+ uint32_t renderTargetArrayIndex, BYTE* pSrcHotTile) -+{ -+ // Grab destination surface state from private context -+ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; -+ SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; -+ -+ StoreHotTile(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); -+} -+ -+INLINE void -+swr_StoreHotTileClear(HANDLE hPrivateContext, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, -+ UINT y, -+ const float* pClearColor) -+{ -+ // Grab destination surface state from private context -+ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; -+ SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; -+ -+ StoreHotTileClear(pDstSurface, renderTargetIndex, x, y, pClearColor); -+} -+ -+void InitSimLoadTilesTable(); -+void InitSimStoreTilesTable(); -+void InitSimClearTilesTable(); -+ -+/* Init Load/Store/ClearTiles Tables */ -+INLINE void swr_InitMemoryModule() -+{ -+ InitSimLoadTilesTable(); -+ InitSimStoreTilesTable(); -+ InitSimClearTilesTable(); -+} -diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h -new file mode 100644 -index 0000000..4d56ead ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_public.h -@@ -0,0 +1,40 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. 
-+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_PUBLIC_H -+#define SWR_PUBLIC_H -+ -+struct pipe_screen; -+struct sw_winsys; -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+struct pipe_screen *swr_create_screen(struct sw_winsys *winsys); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp -new file mode 100644 -index 0000000..2510b3a ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_query.cpp -@@ -0,0 +1,334 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "pipe/p_defines.h" -+#include "util/u_memory.h" -+#include "os/os_time.h" -+#include "swr_context.h" -+#include "swr_fence.h" -+#include "swr_query.h" -+#include "swr_screen.h" -+#include "swr_state.h" -+ -+ -+static struct swr_query * -+swr_query(struct pipe_query *p) -+{ -+ return (struct swr_query *)p; -+} -+ -+static struct pipe_query * -+swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index) -+{ -+ struct swr_query *pq; -+ -+ assert(type < PIPE_QUERY_TYPES); -+ assert(index < MAX_SO_STREAMS); -+ -+ pq = CALLOC_STRUCT(swr_query); -+ -+ if (pq) { -+ pq->type = type; -+ pq->index = index; -+ } -+ -+ return (struct pipe_query *)pq; -+} -+ -+ -+static void -+swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q) -+{ -+ struct swr_query *pq = swr_query(q); -+ -+ if (pq->fence) { -+ if (!swr_is_fence_done(swr_fence(pq->fence))) { -+ swr_fence_submit(swr_context(pipe), pq->fence); -+ swr_fence_finish(pipe->screen, pq->fence, 0); -+ } -+ swr_fence_reference(pipe->screen, &pq->fence, NULL); -+ } -+ -+ FREE(pq); -+} -+ -+ -+// XXX Create a fence callback, rather than stalling SwrWaitForIdle -+static void -+swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ assert(pq->result); -+ union pipe_query_result *result = pq->result; -+ boolean enable_stats = pq->enable_stats; -+ SWR_STATS swr_stats = {0}; -+ -+ if (pq->fence) { -+ if (!swr_is_fence_done(swr_fence(pq->fence))) { -+ swr_fence_submit(ctx, pq->fence); -+ swr_fence_finish(pipe->screen, pq->fence, 0); -+ } -+ swr_fence_reference(pipe->screen, &pq->fence, NULL); -+ } -+ -+ /* -+ * These queries don't need SWR Stats enabled in the core -+ * Set and return. -+ */ -+ switch (pq->type) { -+ case PIPE_QUERY_TIMESTAMP: -+ case PIPE_QUERY_TIME_ELAPSED: -+ result->u64 = swr_get_timestamp(pipe->screen); -+ return; -+ break; -+ case PIPE_QUERY_TIMESTAMP_DISJOINT: -+ /* nothing to do here */ -+ return; -+ break; -+ case PIPE_QUERY_GPU_FINISHED: -+ result->b = TRUE; /* XXX TODO Add an api func to SWR to compare drawId -+ vs LastRetiredId? 
*/ -+ return; -+ break; -+ default: -+ /* Any query that needs SwrCore stats */ -+ break; -+ } -+ -+ /* -+ * All other results are collected from SwrCore counters -+ */ -+ -+ /* XXX, Should turn this into a fence callback and skip the stall */ -+ SwrGetStats(ctx->swrContext, &swr_stats); -+ /* SwrGetStats returns immediately, wait for collection */ -+ SwrWaitForIdle(ctx->swrContext); -+ -+ switch (pq->type) { -+ case PIPE_QUERY_OCCLUSION_PREDICATE: -+ case PIPE_QUERY_OCCLUSION_COUNTER: -+ result->u64 = swr_stats.DepthPassCount; -+ break; -+ case PIPE_QUERY_PRIMITIVES_GENERATED: -+ result->u64 = swr_stats.IaPrimitives; -+ break; -+ case PIPE_QUERY_PRIMITIVES_EMITTED: -+ result->u64 = swr_stats.SoNumPrimsWritten[pq->index]; -+ break; -+ case PIPE_QUERY_SO_STATISTICS: -+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { -+ struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; -+ so_stats->num_primitives_written = -+ swr_stats.SoNumPrimsWritten[pq->index]; -+ so_stats->primitives_storage_needed = -+ swr_stats.SoPrimStorageNeeded[pq->index]; -+ } break; -+ case PIPE_QUERY_PIPELINE_STATISTICS: { -+ struct pipe_query_data_pipeline_statistics *p_stats = -+ &result->pipeline_statistics; -+ p_stats->ia_vertices = swr_stats.IaVertices; -+ p_stats->ia_primitives = swr_stats.IaPrimitives; -+ p_stats->vs_invocations = swr_stats.VsInvocations; -+ p_stats->gs_invocations = swr_stats.GsInvocations; -+ p_stats->gs_primitives = swr_stats.GsPrimitives; -+ p_stats->c_invocations = swr_stats.CPrimitives; -+ p_stats->c_primitives = swr_stats.CPrimitives; -+ p_stats->ps_invocations = swr_stats.PsInvocations; -+ p_stats->hs_invocations = swr_stats.HsInvocations; -+ p_stats->ds_invocations = swr_stats.DsInvocations; -+ p_stats->cs_invocations = swr_stats.CsInvocations; -+ } break; -+ default: -+ assert(0 && "Unsupported query"); -+ break; -+ } -+ -+ /* Only change stat collection if there are no active queries */ -+ if (ctx->active_queries == 0) -+ SwrEnableStats(ctx->swrContext, enable_stats); -+} -+ -+ -+static boolean -+swr_get_query_result(struct pipe_context *pipe, -+ struct pipe_query *q, -+ boolean wait, -+ union pipe_query_result *result) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_query *pq = swr_query(q); -+ -+ if (pq->fence) { -+ if (!swr_is_fence_done(swr_fence(pq->fence))) { -+ swr_fence_submit(ctx, pq->fence); -+ if (!wait) -+ return FALSE; -+ swr_fence_finish(pipe->screen, pq->fence, 0); -+ } -+ swr_fence_reference(pipe->screen, &pq->fence, NULL); -+ } -+ -+ /* XXX: Need to handle counter rollover */ -+ -+ switch (pq->type) { -+ /* Booleans */ -+ case PIPE_QUERY_OCCLUSION_PREDICATE: -+ result->b = pq->end.u64 != pq->start.u64 ? 
TRUE : FALSE; -+ break; -+ case PIPE_QUERY_GPU_FINISHED: -+ result->b = pq->end.b; -+ break; -+ /* Counters */ -+ case PIPE_QUERY_OCCLUSION_COUNTER: -+ case PIPE_QUERY_TIMESTAMP: -+ case PIPE_QUERY_TIME_ELAPSED: -+ case PIPE_QUERY_PRIMITIVES_GENERATED: -+ case PIPE_QUERY_PRIMITIVES_EMITTED: -+ result->u64 = pq->end.u64 - pq->start.u64; -+ break; -+ /* Structures */ -+ case PIPE_QUERY_SO_STATISTICS: { -+ struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; -+ struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; -+ struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; -+ so_stats->num_primitives_written = -+ end->num_primitives_written - start->num_primitives_written; -+ so_stats->primitives_storage_needed = -+ end->primitives_storage_needed - start->primitives_storage_needed; -+ } break; -+ case PIPE_QUERY_TIMESTAMP_DISJOINT: { -+ /* os_get_time_nano returns nanoseconds */ -+ result->timestamp_disjoint.frequency = UINT64_C(1000000000); -+ result->timestamp_disjoint.disjoint = FALSE; -+ } break; -+ case PIPE_QUERY_PIPELINE_STATISTICS: { -+ struct pipe_query_data_pipeline_statistics *p_stats = -+ &result->pipeline_statistics; -+ struct pipe_query_data_pipeline_statistics *start = -+ &pq->start.pipeline_statistics; -+ struct pipe_query_data_pipeline_statistics *end = -+ &pq->end.pipeline_statistics; -+ p_stats->ia_vertices = end->ia_vertices - start->ia_vertices; -+ p_stats->ia_primitives = end->ia_primitives - start->ia_primitives; -+ p_stats->vs_invocations = end->vs_invocations - start->vs_invocations; -+ p_stats->gs_invocations = end->gs_invocations - start->gs_invocations; -+ p_stats->gs_primitives = end->gs_primitives - start->gs_primitives; -+ p_stats->c_invocations = end->c_invocations - start->c_invocations; -+ p_stats->c_primitives = end->c_primitives - start->c_primitives; -+ p_stats->ps_invocations = end->ps_invocations - start->ps_invocations; -+ p_stats->hs_invocations = end->hs_invocations - start->hs_invocations; -+ p_stats->ds_invocations = end->ds_invocations - start->ds_invocations; -+ p_stats->cs_invocations = end->cs_invocations - start->cs_invocations; -+ } break; -+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { -+ struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; -+ struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; -+ uint64_t num_primitives_written = -+ end->num_primitives_written - start->num_primitives_written; -+ uint64_t primitives_storage_needed = -+ end->primitives_storage_needed - start->primitives_storage_needed; -+ result->b = num_primitives_written > primitives_storage_needed; -+ } break; -+ default: -+ assert(0 && "Unsupported query"); -+ break; -+ } -+ -+ return TRUE; -+} -+ -+static boolean -+swr_begin_query(struct pipe_context *pipe, struct pipe_query *q) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_query *pq = swr_query(q); -+ -+ /* Initialize Results */ -+ memset(&pq->start, 0, sizeof(pq->start)); -+ memset(&pq->end, 0, sizeof(pq->end)); -+ -+ /* Gather start stats and enable SwrCore counters */ -+ pq->result = &pq->start; -+ pq->enable_stats = TRUE; -+ swr_gather_stats(pipe, pq); -+ ctx->active_queries++; -+ -+ /* override start timestamp to 0 for TIMESTAMP query */ -+ if (pq->type == PIPE_QUERY_TIMESTAMP) -+ pq->start.u64 = 0; -+ -+ return true; -+} -+ -+static void -+swr_end_query(struct pipe_context *pipe, struct pipe_query *q) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_query *pq = swr_query(q); -+ -+ 
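/* Decrement the active query count before gathering, so that
-+ * swr_gather_stats() turns SwrCore stat collection back off when the
-+ * last active query ends. */
-+ 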
assert(ctx->active_queries -+ && "swr_end_query, there are no active queries!"); -+ ctx->active_queries--; -+ -+ /* Gather end stats and disable SwrCore counters */ -+ pq->result = &pq->end; -+ pq->enable_stats = FALSE; -+ swr_gather_stats(pipe, pq); -+} -+ -+ -+boolean -+swr_check_render_cond(struct pipe_context *pipe) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ boolean b, wait; -+ uint64_t result; -+ -+ if (!ctx->render_cond_query) -+ return TRUE; /* no query predicate, draw normally */ -+ -+ wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT -+ || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); -+ -+ b = pipe->get_query_result( -+ pipe, ctx->render_cond_query, wait, (union pipe_query_result *)&result); -+ if (b) -+ return (!result == ctx->render_cond_cond); -+ else -+ return TRUE; -+} -+ -+void -+swr_query_init(struct pipe_context *pipe) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ pipe->create_query = swr_create_query; -+ pipe->destroy_query = swr_destroy_query; -+ pipe->begin_query = swr_begin_query; -+ pipe->end_query = swr_end_query; -+ pipe->get_query_result = swr_get_query_result; -+ -+ ctx->active_queries = 0; -+} -diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h -new file mode 100644 -index 0000000..2a2aeee ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_query.h -@@ -0,0 +1,48 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#ifndef SWR_QUERY_H -+#define SWR_QUERY_H -+ -+ -+#include -+#include "os/os_thread.h" -+ -+ -+struct swr_query { -+ unsigned type; /* PIPE_QUERY_* */ -+ unsigned index; -+ -+ union pipe_query_result *result; -+ union pipe_query_result start; -+ union pipe_query_result end; -+ -+ struct pipe_fence_handle *fence; -+ -+ boolean enable_stats; -+}; -+ -+extern void swr_query_init(struct pipe_context *pipe); -+ -+extern boolean swr_check_render_cond(struct pipe_context *pipe); -+#endif -diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h -new file mode 100644 -index 0000000..f7f641e ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_resource.h -@@ -0,0 +1,98 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#ifndef SWR_RESOURCE_H -+#define SWR_RESOURCE_H -+ -+#include "pipe/p_state.h" -+#include "api.h" -+ -+struct sw_displaytarget; -+ -+struct swr_resource { -+ struct pipe_resource base; -+ -+ bool has_depth; -+ bool has_stencil; -+ -+ UINT alignedWidth; -+ UINT alignedHeight; -+ -+ SWR_SURFACE_STATE swr; -+ SWR_SURFACE_STATE secondary; // for faking depth/stencil merged formats -+ -+ struct sw_displaytarget *display_target; -+ -+ unsigned row_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ unsigned img_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; -+ -+ /* Opaque pointer to swr_context to mark resource in use */ -+ void *bound_to_context; -+}; -+ -+ -+static INLINE struct swr_resource * -+swr_resource(struct pipe_resource *resource) -+{ -+ return (struct swr_resource *)resource; -+} -+ -+static INLINE boolean -+swr_resource_is_texture(const struct pipe_resource *resource) -+{ -+ switch (resource->target) { -+ case PIPE_BUFFER: -+ return FALSE; -+ case PIPE_TEXTURE_1D: -+ case PIPE_TEXTURE_1D_ARRAY: -+ case PIPE_TEXTURE_2D: -+ case PIPE_TEXTURE_2D_ARRAY: -+ case PIPE_TEXTURE_RECT: -+ case PIPE_TEXTURE_3D: -+ case PIPE_TEXTURE_CUBE: -+ case PIPE_TEXTURE_CUBE_ARRAY: -+ return TRUE; -+ default: -+ assert(0); -+ return FALSE; -+ } -+} -+ -+ -+static INLINE void * -+swr_resource_data(struct pipe_resource *resource) -+{ -+ struct swr_resource *swr_r = swr_resource(resource); -+ -+ assert(!swr_resource_is_texture(resource)); -+ -+ return swr_r->swr.pBaseAddress; -+} -+ -+ -+void swr_store_render_target(struct swr_context *ctx, -+ uint32_t attachment, -+ enum SWR_TILE_STATE post_tile_state, -+ struct SWR_SURFACE_STATE *surface = nullptr); -+#endif -diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp -new file mode 100644 -index 0000000..e6c448c ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_scratch.cpp -@@ -0,0 +1,116 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "util/u_memory.h" -+#include "swr_context.h" -+#include "swr_scratch.h" -+#include "api.h" -+ -+ -+void * -+swr_copy_to_scratch_space(struct swr_context *ctx, -+ struct swr_scratch_space *space, -+ const void *user_buffer, -+ unsigned int size) -+{ -+ void *ptr; -+ assert(space); -+ assert(user_buffer); -+ assert(size); -+ -+ if (size >= 2048) { /* XXX TODO create KNOB_ for this */ -+ /* Use per draw SwrAllocDrawContextMemory for larger copies */ -+ ptr = SwrAllocDrawContextMemory(ctx->swrContext, size, 4); -+ } else { -+ /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */ -+ unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT; -+ -+ /* Need to grow space */ -+ if (max_size_in_flight > space->current_size) { -+ /* Must idle the pipeline, this is infrequent */ -+ SwrWaitForIdle(ctx->swrContext); -+ -+ space->current_size = max_size_in_flight; -+ -+ if (space->base) { -+ align_free(space->base); -+ space->base = NULL; -+ } -+ -+ if (!space->base) { -+ space->base = (BYTE *)align_malloc(space->current_size, 4); -+ space->head = (void *)space->base; -+ } -+ } -+ -+ /* Wrap */ -+ if (((BYTE *)space->head + size) -+ >= ((BYTE *)space->base + space->current_size)) { -+ /* -+ * TODO XXX: Should add a fence on wrap. Assumption is that -+ * current_space >> size, and there are at least MAX_DRAWS_IN_FLIGHT -+ * draws in scratch. So fence would always be met on wrap. A fence -+ * would ensure that first frame in buffer is done before wrapping. -+ * If fence ever needs to be waited on, can increase buffer size. -+ * So far in testing, this hasn't been necessary. -+ */ -+ space->head = space->base; -+ } -+ -+ ptr = space->head; -+ space->head = (BYTE *)space->head + size; -+ } -+ -+ /* Copy user_buffer to scratch */ -+ memcpy(ptr, user_buffer, size); -+ -+ return ptr; -+} -+ -+ -+void -+swr_init_scratch_buffers(struct swr_context *ctx) -+{ -+ struct swr_scratch_buffers *scratch; -+ -+ scratch = CALLOC_STRUCT(swr_scratch_buffers); -+ ctx->scratch = scratch; -+} -+ -+void -+swr_destroy_scratch_buffers(struct swr_context *ctx) -+{ -+ struct swr_scratch_buffers *scratch = ctx->scratch; -+ -+ if (scratch) { -+ if (scratch->vs_constants.base) -+ align_free(scratch->vs_constants.base); -+ if (scratch->fs_constants.base) -+ align_free(scratch->fs_constants.base); -+ if (scratch->vertex_buffer.base) -+ align_free(scratch->vertex_buffer.base); -+ if (scratch->index_buffer.base) -+ align_free(scratch->index_buffer.base); -+ FREE(scratch); -+ } -+} -diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h -new file mode 100644 -index 0000000..74218d6 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_scratch.h -@@ -0,0 +1,63 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_SCRATCH_H -+#define SWR_SCRATCH_H -+ -+struct swr_scratch_space { -+ void *head; -+ unsigned int current_size; -+ /* TODO XXX: Add a fence for wrap condition. */ -+ -+ void *base; -+}; -+ -+struct swr_scratch_buffers { -+ struct swr_scratch_space vs_constants; -+ struct swr_scratch_space fs_constants; -+ struct swr_scratch_space vertex_buffer; -+ struct swr_scratch_space index_buffer; -+}; -+ -+ -+/* -+ * swr_copy_to_scratch_space -+ * Copies size bytes of user_buffer into the scratch ring buffer. -+ * Used to store temporary data such as client arrays and constants. -+ * -+ * Inputs: -+ * space ptr to scratch pool (vs_constants, fs_constants) -+ * user_buffer, data to copy into scratch space -+ * size to be copied -+ * Returns: -+ * pointer to data copied to scratch space. -+ */ -+void *swr_copy_to_scratch_space(struct swr_context *ctx, -+ struct swr_scratch_space *space, -+ const void *user_buffer, -+ unsigned int size); -+ -+void swr_init_scratch_buffers(struct swr_context *ctx); -+void swr_destroy_scratch_buffers(struct swr_context *ctx); -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp -new file mode 100644 -index 0000000..66eb58b ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_screen.cpp -@@ -0,0 +1,666 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "pipe/p_screen.h" -+#include "pipe/p_defines.h" -+#include "util/u_memory.h" -+#include "util/u_format.h" -+#include "util/u_inlines.h" -+#include "util/u_cpu_detect.h" -+ -+#include "state_tracker/sw_winsys.h" -+ -+extern "C" { -+#include "gallivm/lp_bld_limits.h" -+} -+ -+#include "swr_public.h" -+#include "swr_screen.h" -+#include "swr_context.h" -+#include "swr_resource.h" -+#include "swr_fence.h" -+#include "gen_knobs.h" -+ -+#include "jit_api.h" -+ -+#include -+ -+static const char * -+swr_get_name(struct pipe_screen *screen) -+{ -+ return "SWR"; -+} -+ -+static const char * -+swr_get_vendor(struct pipe_screen *screen) -+{ -+ return "Intel Corporation"; -+} -+ -+static boolean -+swr_is_format_supported(struct pipe_screen *screen, -+ enum pipe_format format, -+ enum pipe_texture_target target, -+ unsigned sample_count, -+ unsigned bind) -+{ -+ struct sw_winsys *winsys = swr_screen(screen)->winsys; -+ const struct util_format_description *format_desc; -+ -+ assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D -+ || target == PIPE_TEXTURE_1D_ARRAY -+ || target == PIPE_TEXTURE_2D -+ || target == PIPE_TEXTURE_2D_ARRAY -+ || target == PIPE_TEXTURE_RECT -+ || target == PIPE_TEXTURE_3D -+ || target == PIPE_TEXTURE_CUBE -+ || target == PIPE_TEXTURE_CUBE_ARRAY); -+ -+ format_desc = util_format_description(format); -+ if (!format_desc) -+ return FALSE; -+ -+ if (sample_count > 1) -+ return FALSE; -+ -+ if (bind -+ & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) { -+ if (!winsys->is_displaytarget_format_supported(winsys, bind, format)) -+ return FALSE; -+ } -+ -+ if (bind & PIPE_BIND_RENDER_TARGET) { -+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) -+ return FALSE; -+ -+ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) -+ return FALSE; -+ -+ /* -+ * Although possible, it is unnatural to render into compressed or YUV -+ * surfaces. So disable these here to avoid going into weird paths -+ * inside the state trackers. -+ */ -+ if (format_desc->block.width != 1 || format_desc->block.height != 1) -+ return FALSE; -+ } -+ -+ /* We're going to lie and say we support all depth/stencil formats. -+ * SWR actually needs separate bindings, and only does F32 depth. 
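-+ * Merged depth/stencil formats are handled at resource-creation time by
-+ * splitting stencil into the separate "secondary" surface (see swr_resource).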
-+ */ -+ if (bind & PIPE_BIND_DEPTH_STENCIL) { -+ if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) -+ return FALSE; -+ } -+ -+ return TRUE; -+} -+ -+static int -+swr_get_param(struct pipe_screen *screen, enum pipe_cap param) -+{ -+ switch (param) { -+ case PIPE_CAP_NPOT_TEXTURES: -+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: -+ return 1; -+ case PIPE_CAP_TWO_SIDED_STENCIL: -+ return 1; -+ case PIPE_CAP_SM3: -+ return 1; -+ case PIPE_CAP_ANISOTROPIC_FILTER: -+ return 0; -+ case PIPE_CAP_POINT_SPRITE: -+ return 1; -+ case PIPE_CAP_MAX_RENDER_TARGETS: -+ return PIPE_MAX_COLOR_BUFS; -+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: -+ return 1; -+ case PIPE_CAP_OCCLUSION_QUERY: -+ case PIPE_CAP_QUERY_TIME_ELAPSED: -+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS: -+ return 1; -+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: -+ return 1; -+ case PIPE_CAP_TEXTURE_SHADOW_MAP: -+ return 1; -+ case PIPE_CAP_TEXTURE_SWIZZLE: -+ return 1; -+ case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: -+ return 0; -+ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: -+ return 13; // xxx This increases rendertarget max size to 4k x 4k. No -+ // way to separate widht/height. -+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: -+ return 12; // xxx -+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: -+ return 12; // xxx -+ case PIPE_CAP_BLEND_EQUATION_SEPARATE: -+ return 1; -+ case PIPE_CAP_INDEP_BLEND_ENABLE: -+ return 1; -+ case PIPE_CAP_INDEP_BLEND_FUNC: -+ return 1; -+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: -+ return 0; // Don't support lower left frag coord. -+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: -+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: -+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: -+ return 1; -+ case PIPE_CAP_DEPTH_CLIP_DISABLE: -+ return 1; -+ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: -+ return MAX_SO_STREAMS; -+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: -+ case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: -+ return MAX_ATTRIBUTES; -+ case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: -+ case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: -+ return 1024; -+ case PIPE_CAP_MAX_VERTEX_STREAMS: -+ return 1; -+ case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: -+ return 2048; -+ case PIPE_CAP_PRIMITIVE_RESTART: -+ return 1; -+ case PIPE_CAP_SHADER_STENCIL_EXPORT: -+ return 1; -+ case PIPE_CAP_TGSI_INSTANCEID: -+ case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: -+ case PIPE_CAP_START_INSTANCE: -+ return 1; -+ case PIPE_CAP_SEAMLESS_CUBE_MAP: -+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: -+ return 1; -+ case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: -+ return 256; /* for GL3 */ -+ case PIPE_CAP_MIN_TEXEL_OFFSET: -+ return -8; -+ case PIPE_CAP_MAX_TEXEL_OFFSET: -+ return 7; -+ case PIPE_CAP_CONDITIONAL_RENDER: -+ return 1; -+ case PIPE_CAP_TEXTURE_BARRIER: -+ return 0; -+ case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: -+ case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: /* draw module */ -+ case PIPE_CAP_VERTEX_COLOR_CLAMPED: /* draw module */ -+ return 1; -+ case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: -+ return 0; -+ case PIPE_CAP_GLSL_FEATURE_LEVEL: -+ return 330; -+ case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: -+ return 0; -+ case PIPE_CAP_COMPUTE: -+ return 0; -+ case PIPE_CAP_USER_VERTEX_BUFFERS: -+ case PIPE_CAP_USER_INDEX_BUFFERS: -+ case PIPE_CAP_USER_CONSTANT_BUFFERS: -+ case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: -+ case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: -+ return 1; -+ case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: -+ return 16; -+ case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: -+ case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: -+ case 
PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: -+ case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: -+ case PIPE_CAP_TEXTURE_MULTISAMPLE: -+ return 0; -+ case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: -+ return 64; -+ case PIPE_CAP_QUERY_TIMESTAMP: -+ return 1; -+ case PIPE_CAP_CUBE_MAP_ARRAY: -+ return 0; -+ case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: -+ return 1; -+ case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: -+ return 65536; -+ case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: -+ return 0; -+ case PIPE_CAP_TGSI_TEXCOORD: -+ case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: -+ return 0; -+ case PIPE_CAP_MAX_VIEWPORTS: -+ return 1; -+ case PIPE_CAP_ENDIANNESS: -+ return PIPE_ENDIAN_NATIVE; -+ case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: -+ case PIPE_CAP_TEXTURE_GATHER_SM5: -+ return 0; -+ case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: -+ return 1; -+ case PIPE_CAP_TEXTURE_QUERY_LOD: -+ case PIPE_CAP_SAMPLE_SHADING: -+ case PIPE_CAP_TEXTURE_GATHER_OFFSETS: -+ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: -+ case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: -+ case PIPE_CAP_SAMPLER_VIEW_TARGET: -+ return 0; -+ case PIPE_CAP_FAKE_SW_MSAA: -+ return 1; -+ case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: -+ case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: -+ return 0; -+ case PIPE_CAP_DRAW_INDIRECT: -+ return 1; -+ -+ case PIPE_CAP_VENDOR_ID: -+ return 0xFFFFFFFF; -+ case PIPE_CAP_DEVICE_ID: -+ return 0xFFFFFFFF; -+ case PIPE_CAP_ACCELERATED: -+ return 0; -+ case PIPE_CAP_VIDEO_MEMORY: { -+ /* XXX: Do we want to return the full amount of system memory ? */ -+ uint64_t system_memory; -+ -+ if (!os_get_total_physical_memory(&system_memory)) -+ return 0; -+ -+ return (int)(system_memory >> 20); -+ } -+ case PIPE_CAP_UMA: -+ return 1; -+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: -+ return 1; -+ case PIPE_CAP_CLIP_HALFZ: -+ return 1; -+ case PIPE_CAP_VERTEXID_NOBASE: -+ return 0; -+ case PIPE_CAP_POLYGON_OFFSET_CLAMP: -+ return 1; -+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: -+ return 0; -+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: -+ return 0; // xxx -+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: -+ return 0; -+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: -+ return 0; -+ case PIPE_CAP_DEPTH_BOUNDS_TEST: -+ return 0; // xxx -+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR: -+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: -+ return 1; -+ } -+ -+ /* should only get here on unhandled cases */ -+ debug_printf("Unexpected PIPE_CAP %d query\n", param); -+ return 0; -+} -+ -+static int -+swr_get_shader_param(struct pipe_screen *screen, -+ unsigned shader, -+ enum pipe_shader_cap param) -+{ -+ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_FRAGMENT) -+ return gallivm_get_shader_param(param); -+ -+ // Todo: geometry, tesselation, compute -+ return 0; -+} -+ -+ -+static float -+swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param) -+{ -+ switch (param) { -+ case PIPE_CAPF_MAX_LINE_WIDTH: -+ case PIPE_CAPF_MAX_LINE_WIDTH_AA: -+ case PIPE_CAPF_MAX_POINT_WIDTH: -+ return 255.0; /* arbitrary */ -+ case PIPE_CAPF_MAX_POINT_WIDTH_AA: -+ return 0.0; -+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: -+ return 0.0; -+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: -+ return 0.0; -+ case PIPE_CAPF_GUARD_BAND_LEFT: -+ case PIPE_CAPF_GUARD_BAND_TOP: -+ case PIPE_CAPF_GUARD_BAND_RIGHT: -+ case PIPE_CAPF_GUARD_BAND_BOTTOM: -+ return 0.0; -+ } -+ /* should only get here on unhandled cases */ -+ debug_printf("Unexpected PIPE_CAPF %d query\n", param); -+ return 0.0; -+} -+ -+SWR_FORMAT -+mesa_to_swr_format(enum pipe_format format) -+{ -+ const struct util_format_description 
*format_desc = -+ util_format_description(format); -+ if (!format_desc) -+ return (SWR_FORMAT)-1; -+ -+ // more robust check would be comparing all attributes of the formats -+ // luckily format names are mostly standardized -+ for (int i = 0; i < NUM_SWR_FORMATS; i++) { -+ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo((SWR_FORMAT)i); -+ -+ if (!strcasecmp(format_desc->short_name, swr_desc.name)) -+ return (SWR_FORMAT)i; -+ } -+ -+ // ... with some exceptions -+ switch (format) { -+ case PIPE_FORMAT_R8G8B8A8_SRGB: -+ return R8G8B8A8_UNORM_SRGB; -+ case PIPE_FORMAT_B8G8R8A8_SRGB: -+ return B8G8R8A8_UNORM_SRGB; -+ case PIPE_FORMAT_I8_UNORM: -+ return R8_UNORM; -+ case PIPE_FORMAT_Z24_UNORM_S8_UINT: -+ return R24_UNORM_X8_TYPELESS; -+ case PIPE_FORMAT_L8A8_UNORM: -+ return R8G8_UNORM; -+ default: -+ break; -+ } -+ -+ debug_printf("asked to convert unsupported format %s\n", -+ format_desc->name); -+ return (SWR_FORMAT)-1; -+} -+ -+static boolean -+swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res) -+{ -+ struct sw_winsys *winsys = screen->winsys; -+ -+ UINT stride; -+ res->display_target = winsys->displaytarget_create(winsys, -+ res->base.bind, -+ res->base.format, -+ res->alignedWidth, -+ res->alignedHeight, -+ 64, -+ &stride); -+ -+ if (res->display_target == NULL) -+ return FALSE; -+ -+ /* Clear the display target surface */ -+ void *map = winsys->displaytarget_map( -+ winsys, res->display_target, PIPE_TRANSFER_WRITE); -+ -+ if (map) -+ memset(map, 0, res->alignedHeight * stride); -+ -+ winsys->displaytarget_unmap(winsys, res->display_target); -+ -+ return TRUE; -+} -+ -+static struct pipe_resource * -+swr_resource_create(struct pipe_screen *_screen, -+ const struct pipe_resource *templat) -+{ -+ struct swr_screen *screen = swr_screen(_screen); -+ struct swr_resource *res = CALLOC_STRUCT(swr_resource); -+ if (!res) -+ return NULL; -+ -+ res->base = *templat; -+ pipe_reference_init(&res->base.reference, 1); -+ res->base.screen = &screen->base; -+ -+ const struct util_format_description *desc = -+ util_format_description(templat->format); -+ res->has_depth = util_format_has_depth(desc); -+ res->has_stencil = util_format_has_stencil(desc); -+ -+ pipe_format fmt = templat->format; -+ if (res->has_depth) -+ fmt = PIPE_FORMAT_Z24_UNORM_S8_UINT; -+ if (res->has_stencil && !res->has_depth) -+ fmt = PIPE_FORMAT_R8_UINT; -+ -+ res->swr.width = templat->width0; -+ res->swr.height = templat->height0; -+ res->swr.depth = templat->depth0; -+ res->swr.type = SURFACE_2D; -+ res->swr.tileMode = SWR_TILE_NONE; -+ res->swr.format = mesa_to_swr_format(fmt); -+ res->swr.numSamples = (1 << templat->nr_samples); -+ -+ SWR_FORMAT_INFO finfo = GetFormatInfo(res->swr.format); -+ -+ unsigned total_size = 0; -+ unsigned width = templat->width0; -+ unsigned height = templat->height0; -+ unsigned depth = templat->depth0; -+ unsigned layers = templat->array_size; -+ -+ for (int level = 0; level <= templat->last_level; level++) { -+ unsigned alignedWidth, alignedHeight; -+ unsigned num_slices; -+ -+ if (templat->bind & (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET -+ | PIPE_BIND_DISPLAY_TARGET)) { -+ alignedWidth = (width + (KNOB_MACROTILE_X_DIM - 1)) -+ & ~(KNOB_MACROTILE_X_DIM - 1); -+ alignedHeight = (height + (KNOB_MACROTILE_Y_DIM - 1)) -+ & ~(KNOB_MACROTILE_Y_DIM - 1); -+ } else { -+ alignedWidth = width; -+ alignedHeight = height; -+ } -+ -+ if (level == 0) { -+ res->alignedWidth = alignedWidth; -+ res->alignedHeight = alignedHeight; -+ } -+ -+ res->row_stride[level] = alignedWidth 
* finfo.Bpp; -+ res->img_stride[level] = res->row_stride[level] * alignedHeight; -+ res->mip_offsets[level] = total_size; -+ -+ if (templat->target == PIPE_TEXTURE_3D) -+ num_slices = depth; -+ else if (templat->target == PIPE_TEXTURE_1D_ARRAY -+ || templat->target == PIPE_TEXTURE_2D_ARRAY -+ || templat->target == PIPE_TEXTURE_CUBE -+ || templat->target == PIPE_TEXTURE_CUBE_ARRAY) -+ num_slices = layers; -+ else -+ num_slices = 1; -+ -+ total_size += res->img_stride[level] * num_slices; -+ -+ width = u_minify(width, 1); -+ height = u_minify(height, 1); -+ depth = u_minify(depth, 1); -+ } -+ -+ res->swr.halign = res->alignedWidth; -+ res->swr.valign = res->alignedHeight; -+ res->swr.pitch = res->row_stride[0]; -+ res->swr.pBaseAddress = (BYTE *)_aligned_malloc(total_size, 64); -+ -+ if (res->has_depth && res->has_stencil) { -+ res->secondary.width = templat->width0; -+ res->secondary.height = templat->height0; -+ res->secondary.depth = templat->depth0; -+ res->secondary.type = SURFACE_2D; -+ res->secondary.tileMode = SWR_TILE_NONE; -+ res->secondary.format = R8_UINT; -+ res->secondary.numSamples = (1 << templat->nr_samples); -+ -+ SWR_FORMAT_INFO finfo = GetFormatInfo(res->secondary.format); -+ res->secondary.pitch = res->alignedWidth * finfo.Bpp; -+ res->secondary.pBaseAddress = (BYTE *)_aligned_malloc( -+ res->alignedHeight * res->secondary.pitch, 64); -+ } -+ -+ if (swr_resource_is_texture(&res->base)) { -+ if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT -+ | PIPE_BIND_SHARED)) { -+ /* displayable surface */ -+ if (!swr_displaytarget_layout(screen, res)) -+ goto fail; -+ } -+ } -+ -+ return &res->base; -+ -+fail: -+ FREE(res); -+ return NULL; -+} -+ -+static void -+swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) -+{ -+ struct swr_screen *screen = swr_screen(p_screen); -+ struct swr_resource *res = swr_resource(pt); -+ -+ /* -+ * If this resource is attached to a context it may still be in use, check -+ * dependencies before freeing -+ * XXX TODO: don't use SwrWaitForIdle, use fences and come up with a real -+ * resource manager. -+ * XXX It's happened that we get a swr_destroy prior to freeing the -+ * framebuffer resource. Don't wait on it. -+ */ -+ if (res->bound_to_context && !res->display_target) { -+ struct swr_context *ctx = -+ swr_context((pipe_context *)res->bound_to_context); -+ SwrWaitForIdle( -+ ctx->swrContext); // BMCDEBUG, don't SwrWaitForIdle!!! Use a fence. 
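/* Editor's sketch (not part of the original patch): the XXX above asks for a
 * fence-based wait here instead of the full-pipeline SwrWaitForIdle().
 * Assuming a hypothetical per-resource fence handle (res->busy_fence)
 * signalled by the draw path -- no such field exists in this patch -- the
 * wait could reuse the swr_fence helpers the patch already introduces:
 *
 *    if (res->bound_to_context && !res->display_target) {
 *       // Block only until work referencing this resource has retired,
 *       // rather than draining the whole SWR pipeline.
 *       swr_fence_finish(p_screen, res->busy_fence, 0);
 *       swr_fence_reference(p_screen, &res->busy_fence, NULL);
 *    }
 */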
-+ } -+ -+ if (res->display_target) { -+ /* display target */ -+ struct sw_winsys *winsys = screen->winsys; -+ winsys->displaytarget_destroy(winsys, res->display_target); -+ } -+ -+ _aligned_free(res->swr.pBaseAddress); -+ _aligned_free(res->secondary.pBaseAddress); -+ -+ FREE(res); -+} -+ -+ -+static void -+swr_flush_frontbuffer(struct pipe_screen *p_screen, -+ struct pipe_resource *resource, -+ unsigned level, -+ unsigned layer, -+ void *context_private, -+ struct pipe_box *sub_box) -+{ -+ SWR_SURFACE_STATE &colorBuffer = swr_resource(resource)->swr; -+ -+ struct swr_screen *screen = swr_screen(p_screen); -+ struct sw_winsys *winsys = screen->winsys; -+ struct swr_resource *res = swr_resource(resource); -+ -+ /* Ensure fence set at flush is finished, before reading frame buffer */ -+ swr_fence_finish(p_screen, screen->flush_fence, 0); -+ -+ void *map = winsys->displaytarget_map( -+ winsys, res->display_target, PIPE_TRANSFER_WRITE); -+ memcpy( -+ map, colorBuffer.pBaseAddress, colorBuffer.pitch * colorBuffer.height); -+ winsys->displaytarget_unmap(winsys, res->display_target); -+ -+ assert(res->display_target); -+ if (res->display_target) -+ winsys->displaytarget_display( -+ winsys, res->display_target, context_private, sub_box); -+} -+ -+ -+static void -+swr_destroy_screen(struct pipe_screen *p_screen) -+{ -+ struct swr_screen *screen = swr_screen(p_screen); -+ struct sw_winsys *winsys = screen->winsys; -+ -+ fprintf(stderr, "SWR destroy screen!\n"); -+ -+ swr_fence_finish(p_screen, screen->flush_fence, 0); -+ swr_fence_reference(p_screen, &screen->flush_fence, NULL); -+ -+ JitDestroyContext(screen->hJitMgr); -+ -+ if (winsys->destroy) -+ winsys->destroy(winsys); -+ -+ FREE(screen); -+} -+ -+ -+struct pipe_screen * -+swr_create_screen(struct sw_winsys *winsys) -+{ -+ struct swr_screen *screen = CALLOC_STRUCT(swr_screen); -+ -+ if (!screen) -+ return NULL; -+ -+ fprintf(stderr, "SWR create screen!\n"); -+ util_cpu_detect(); -+ if (util_cpu_caps.has_avx2) -+ fprintf(stderr, "This processor supports AVX2.\n"); -+ else if (util_cpu_caps.has_avx) -+ fprintf(stderr, "This processor supports AVX.\n"); -+ /* Exit gracefully if there is no AVX support */ -+ else { -+ fprintf(stderr, " !!! This processor does not support AVX or AVX2. " -+ "OpenSWR requires AVX.\n"); -+ exit(-1); -+ } -+ -+ if (!getenv("KNOB_MAX_PRIMS_PER_DRAW")) { -+ g_GlobalKnobs.MAX_PRIMS_PER_DRAW.Value(49152); -+ } -+ -+ screen->winsys = winsys; -+ screen->base.get_name = swr_get_name; -+ screen->base.get_vendor = swr_get_vendor; -+ screen->base.is_format_supported = swr_is_format_supported; -+ screen->base.context_create = swr_create_context; -+ -+ screen->base.destroy = swr_destroy_screen; -+ screen->base.get_param = swr_get_param; -+ screen->base.get_shader_param = swr_get_shader_param; -+ screen->base.get_paramf = swr_get_paramf; -+ -+ screen->base.resource_create = swr_resource_create; -+ screen->base.resource_destroy = swr_resource_destroy; -+ -+ screen->base.flush_frontbuffer = swr_flush_frontbuffer; -+ -+ screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR); -+ -+ swr_fence_init(&screen->base); -+ -+ return &screen->base; -+} -diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h -new file mode 100644 -index 0000000..a96dc44 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_screen.h -@@ -0,0 +1,52 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_SCREEN_H -+#define SWR_SCREEN_H -+ -+#include "pipe/p_screen.h" -+#include "pipe/p_defines.h" -+#include "api.h" -+ -+struct sw_winsys; -+ -+struct swr_screen { -+ struct pipe_screen base; -+ -+ struct pipe_fence_handle *flush_fence; -+ -+ struct sw_winsys *winsys; -+ -+ HANDLE hJitMgr; -+}; -+ -+static INLINE struct swr_screen * -+swr_screen(struct pipe_screen *pipe) -+{ -+ return (struct swr_screen *)pipe; -+} -+ -+SWR_FORMAT -+mesa_to_swr_format(enum pipe_format format); -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp -new file mode 100644 -index 0000000..edad4c2 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_shader.cpp -@@ -0,0 +1,608 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "JitManager.h" -+#include "state.h" -+#include "state_llvm.h" -+#include "builder.h" -+ -+#include "llvm-c/Core.h" -+#include "llvm/Support/CBindingWrapping.h" -+ -+#include "tgsi/tgsi_strings.h" -+#include "gallivm/lp_bld_init.h" -+#include "gallivm/lp_bld_flow.h" -+#include "gallivm/lp_bld_struct.h" -+#include "gallivm/lp_bld_tgsi.h" -+ -+#include "swr_context.h" -+#include "swr_context_llvm.h" -+#include "swr_state.h" -+#include "swr_screen.h" -+ -+bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs) -+{ -+ return !memcmp(&lhs, &rhs, sizeof(lhs)); -+} -+ -+void -+swr_generate_fs_key(struct swr_jit_key &key, -+ struct swr_context *ctx, -+ swr_fragment_shader *swr_fs) -+{ -+ key.nr_cbufs = ctx->framebuffer.nr_cbufs; -+ key.light_twoside = ctx->rasterizer->light_twoside; -+ memcpy(&key.vs_output_semantic_name, -+ &ctx->vs->info.base.output_semantic_name, -+ sizeof(key.vs_output_semantic_name)); -+ memcpy(&key.vs_output_semantic_idx, -+ &ctx->vs->info.base.output_semantic_index, -+ sizeof(key.vs_output_semantic_idx)); -+ -+ key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1; -+ -+ for (unsigned i = 0; i < key.nr_samplers; i++) { -+ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { -+ lp_sampler_static_sampler_state( -+ &key.sampler[i].sampler_state, -+ ctx->samplers[PIPE_SHADER_FRAGMENT][i]); -+ } -+ } -+ -+ /* -+ * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes -+ * are dx10-style? Can't really have mixed opcodes, at least not -+ * if we want to skip the holes here (without rescanning tgsi). -+ */ -+ if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { -+ key.nr_sampler_views = -+ swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; -+ for (unsigned i = 0; i < key.nr_sampler_views; i++) { -+ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { -+ lp_sampler_static_texture_state( -+ &key.sampler[i].texture_state, -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); -+ } -+ } -+ } else { -+ key.nr_sampler_views = key.nr_samplers; -+ for (unsigned i = 0; i < key.nr_sampler_views; i++) { -+ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { -+ lp_sampler_static_texture_state( -+ &key.sampler[i].texture_state, -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); -+ } -+ } -+ } -+ -+ memcpy(&key.alphaTest, -+ &ctx->depth_stencil->alpha, -+ sizeof(struct pipe_alpha_state)); -+} -+ -+struct BuilderSWR : public Builder { -+ BuilderSWR(JitManager *pJitMgr) -+ : Builder(pJitMgr) -+ { -+ pJitMgr->SetupNewModule(); -+ } -+ -+ PFN_VERTEX_FUNC -+ CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs); -+ PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key); -+}; -+ -+PFN_VERTEX_FUNC -+BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) -+{ -+ swr_vs->linkageMask = 0; -+ -+ for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) { -+ switch (swr_vs->info.base.output_semantic_name[i]) { -+ case TGSI_SEMANTIC_POSITION: -+ break; -+ case TGSI_SEMANTIC_PSIZE: -+ swr_vs->pointSizeAttrib = i; -+ break; -+ default: -+ swr_vs->linkageMask |= (1 << i); -+ break; -+ } -+ } -+ -+ // tgsi_dump(swr_vs->pipe.tokens, 0); -+ -+ struct gallivm_state *gallivm = -+ gallivm_create("VS", wrap(&JM()->mContext)); -+ gallivm->module = wrap(JM()->mpCurrentModule); -+ -+ LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; -+ LLVMValueRef 
outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; -+ -+ memset(outputs, 0, sizeof(outputs)); -+ -+ AttrBuilder attrBuilder; -+ attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); -+ AttributeSet attrSet = AttributeSet::get( -+ JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); -+ -+ std::vector vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), -+ PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; -+ FunctionType *vsFuncType = -+ FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); -+ -+ // create new vertex shader function -+ auto pFunction = Function::Create(vsFuncType, -+ GlobalValue::ExternalLinkage, -+ "VS", -+ JM()->mpCurrentModule); -+ pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -+ -+ BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); -+ IRB()->SetInsertPoint(block); -+ LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); -+ -+ auto argitr = pFunction->getArgumentList().begin(); -+ Value *hPrivateData = argitr++; -+ hPrivateData->setName("hPrivateData"); -+ Value *pVsCtx = argitr++; -+ pVsCtx->setName("vsCtx"); -+ -+ Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantVS}); -+ consts_ptr->setName("vs_constants"); -+ Value *const_sizes_ptr = -+ GEP(hPrivateData, {0, swr_draw_context_num_constantsVS}); -+ const_sizes_ptr->setName("num_vs_constants"); -+ -+ Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); -+ -+ for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { -+ const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; -+ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ if (mask & (1 << channel)) { -+ inputs[attrib][channel] = -+ wrap(LOAD(vtxInput, {0, 0, attrib, channel})); -+ } -+ } -+ } -+ -+ struct lp_bld_tgsi_system_values system_values; -+ memset(&system_values, 0, sizeof(system_values)); -+ system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); -+ system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); -+ -+ lp_build_tgsi_soa(gallivm, -+ swr_vs->pipe.tokens, -+ lp_type_float_vec(32, 32 * 8), -+ NULL, // mask -+ wrap(consts_ptr), -+ wrap(const_sizes_ptr), -+ &system_values, -+ inputs, -+ outputs, -+ NULL, // wrap(hPrivateData), (sampler context) -+ NULL, // sampler -+ &swr_vs->info.base, -+ NULL); // geometry shader face -+ -+ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); -+ -+ Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); -+ -+ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { -+ if (!outputs[attrib][channel]) -+ continue; -+ -+ Value *val = LOAD(unwrap(outputs[attrib][channel])); -+ STORE(val, vtxOutput, {0, 0, attrib, channel}); -+ } -+ } -+ -+ RET_VOID(); -+ -+ gallivm_verify_function(gallivm, wrap(pFunction)); -+ gallivm_compile_module(gallivm); -+ -+ // lp_debug_dump_value(func); -+ -+ PFN_VERTEX_FUNC pFunc = -+ (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); -+ -+ debug_printf("vert shader %p\n", pFunc); -+ assert(pFunc && "Error: VertShader = NULL"); -+ -+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) -+ JM()->mIsModuleFinalized = true; -+#endif -+ -+ return pFunc; -+} -+ -+PFN_VERTEX_FUNC -+swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs) -+{ -+ BuilderSWR builder( -+ reinterpret_cast(swr_screen(ctx->screen)->hJitMgr)); -+ return builder.CompileVS(ctx, swr_vs); -+} -+ -+static unsigned 
-+locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) -+{ -+ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { -+ if ((info->output_semantic_name[i] == name) -+ && (info->output_semantic_index[i] == index)) { -+ return i - 1; // position is not part of the linkage -+ } -+ } -+ -+ if (name == TGSI_SEMANTIC_COLOR) { // BCOLOR fallback -+ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { -+ if ((info->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) -+ && (info->output_semantic_index[i] == index)) { -+ return i - 1; // position is not part of the linkage -+ } -+ } -+ } -+ -+ return 0xFFFFFFFF; -+} -+ -+PFN_PIXEL_KERNEL -+BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) -+{ -+ struct swr_fragment_shader *swr_fs = ctx->fs; -+ -+ // tgsi_dump(swr_fs->pipe.tokens, 0); -+ -+ struct gallivm_state *gallivm = -+ gallivm_create("FS", wrap(&JM()->mContext)); -+ gallivm->module = wrap(JM()->mpCurrentModule); -+ -+ LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; -+ LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; -+ -+ memset(inputs, 0, sizeof(inputs)); -+ memset(outputs, 0, sizeof(outputs)); -+ -+ struct lp_build_sampler_soa *sampler = NULL; -+ -+ AttrBuilder attrBuilder; -+ attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); -+ AttributeSet attrSet = AttributeSet::get( -+ JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); -+ -+ std::vector fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), -+ PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; -+ FunctionType *funcType = -+ FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); -+ -+ auto pFunction = Function::Create(funcType, -+ GlobalValue::ExternalLinkage, -+ "FS", -+ JM()->mpCurrentModule); -+ pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -+ -+ BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); -+ IRB()->SetInsertPoint(block); -+ LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); -+ -+ auto &args = pFunction->getArgumentList(); -+ Value *hPrivateData = args.begin(); -+ hPrivateData->setName("hPrivateData"); -+ Value *pPS = ++args.begin(); -+ pPS->setName("psCtx"); -+ -+ Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS}); -+ consts_ptr->setName("fs_constants"); -+ Value *const_sizes_ptr = -+ GEP(hPrivateData, {0, swr_draw_context_num_constantsFS}); -+ const_sizes_ptr->setName("num_fs_constants"); -+ -+ // xxx should check for flat shading versus interpolation -+ -+ // load i -+ Value *vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI}, "i"); -+ -+ // load j -+ Value *vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ}, "j"); -+ -+ // load/compute w -+ Value *vw = FDIV(VIMMED1(1.0f), LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW})); -+ vw->setName("w"); -+ -+ // load *pAttribs, *pPerspAttribs -+ Value *pAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pAttribs"); -+ Value *pPerspAttribs = -+ LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs"); -+ -+ swr_fs->constantMask = 0; -+ -+ for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { -+ const unsigned mask = swr_fs->info.base.input_usage_mask[attrib]; -+ const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib]; -+ -+ if (!mask) -+ continue; -+ -+ ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib]; -+ ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib]; -+ -+ if (semantic_name == TGSI_SEMANTIC_FACE) { -+ Value *ff = -+ UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty); -+ ff = 
FSUB(FMUL(ff, C(2.0f)), C(1.0f)); -+ ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace"); -+ -+ inputs[attrib][0] = wrap(ff); -+ inputs[attrib][1] = wrap(VIMMED1(0.0f)); -+ inputs[attrib][2] = wrap(VIMMED1(0.0f)); -+ inputs[attrib][3] = wrap(VIMMED1(1.0f)); -+ continue; -+ } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord -+ inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX}, "vX")); -+ inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY}, "vY")); -+ inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ")); -+ inputs[attrib][3] = -+ wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW}, "vOneOverW")); -+ continue; -+ } else if (semantic_name == TGSI_SEMANTIC_PRIMID) { -+ Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID"); -+ inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID)); -+ inputs[attrib][1] = wrap(VIMMED1(0)); -+ inputs[attrib][2] = wrap(VIMMED1(0)); -+ inputs[attrib][3] = wrap(VIMMED1(0)); -+ continue; -+ } -+ -+ unsigned linkedAttrib = -+ locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); -+ if (linkedAttrib == 0xFFFFFFFF) { -+ // not found - check for point sprite -+ if (ctx->rasterizer->sprite_coord_enable) { -+ linkedAttrib = ctx->vs->info.base.num_outputs - 1; -+ } else { -+ fprintf(stderr, -+ "Missing %s[%d]\n", -+ tgsi_semantic_names[semantic_name], -+ semantic_idx); -+ assert(0 && "attribute linkage not found"); -+ } -+ } -+ -+ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { -+ swr_fs->constantMask |= 1 << linkedAttrib; -+ } -+ -+ for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ if (mask & (1 << channel)) { -+ Value *indexA = C(linkedAttrib * 12 + channel); -+ Value *indexB = C(linkedAttrib * 12 + channel + 4); -+ Value *indexC = C(linkedAttrib * 12 + channel + 8); -+ -+ if ((semantic_name == TGSI_SEMANTIC_COLOR) -+ && ctx->rasterizer->light_twoside) { -+ unsigned bcolorAttrib = locate_linkage( -+ TGSI_SEMANTIC_BCOLOR, semantic_idx, &ctx->vs->info.base); -+ -+ unsigned diff = 12 * (bcolorAttrib - linkedAttrib); -+ -+ Value *back = -+ XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace"); -+ -+ Value *offset = MUL(back, C(diff)); -+ offset->setName("offset"); -+ -+ indexA = ADD(indexA, offset); -+ indexB = ADD(indexB, offset); -+ indexC = ADD(indexC, offset); -+ -+ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { -+ swr_fs->constantMask |= 1 << bcolorAttrib; -+ } -+ } -+ -+ Value *pAttribPtr = (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) -+ ? 
pPerspAttribs -+ : pAttribs; -+ -+ Value *va = -+ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexA))); -+ Value *vb = -+ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexB))); -+ Value *vc = -+ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexC))); -+ -+ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { -+ inputs[attrib][channel] = wrap(va); -+ } else { -+ Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj); -+ -+ vc = FMUL(vk, vc); -+ -+ Value *interp = FMUL(va, vi); -+ Value *interp1 = FMUL(vb, vj); -+ interp = FADD(interp, interp1); -+ interp = FADD(interp, vc); -+ if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) -+ interp = FMUL(interp, vw); -+ inputs[attrib][channel] = wrap(interp); -+ } -+ } -+ } -+ } -+ -+ sampler = swr_sampler_soa_create(key.sampler); -+ -+ struct lp_bld_tgsi_system_values system_values; -+ memset(&system_values, 0, sizeof(system_values)); -+ -+ struct lp_build_mask_context mask; -+ -+ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { -+ Value *mask_val = LOAD(pPS, {0, SWR_PS_CONTEXT_mask}, "coverage_mask"); -+ lp_build_mask_begin( -+ &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); -+ } -+ -+ lp_build_tgsi_soa(gallivm, -+ swr_fs->pipe.tokens, -+ lp_type_float_vec(32, 32 * 8), -+ swr_fs->info.base.uses_kill ? &mask : NULL, // mask -+ wrap(consts_ptr), -+ wrap(const_sizes_ptr), -+ &system_values, -+ inputs, -+ outputs, -+ wrap(hPrivateData), -+ sampler, // sampler -+ &swr_fs->info.base, -+ NULL); // geometry shader face -+ -+ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); -+ -+ for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs; -+ attrib++) { -+ switch (swr_fs->info.base.output_semantic_name[attrib]) { -+ case TGSI_SEMANTIC_POSITION: { -+ // write z -+ LLVMValueRef outZ = -+ LLVMBuildLoad(gallivm->builder, outputs[attrib][2], ""); -+ STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ}); -+ break; -+ } -+ case TGSI_SEMANTIC_COLOR: { -+ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ if (!outputs[attrib][channel]) -+ continue; -+ -+ LLVMValueRef out = -+ LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], ""); -+ if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { -+ for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) { -+ STORE(unwrap(out), -+ pPS, -+ {0, SWR_PS_CONTEXT_shaded, rt, channel}); -+ } -+ } else { -+ STORE(unwrap(out), -+ pPS, -+ {0, -+ SWR_PS_CONTEXT_shaded, -+ swr_fs->info.base.output_semantic_index[attrib], -+ channel}); -+ } -+ } -+ break; -+ } -+ default: { -+ fprintf(stderr, -+ "unknown output from FS %s[%d]\n", -+ tgsi_semantic_names[swr_fs->info.base -+ .output_semantic_name[attrib]], -+ swr_fs->info.base.output_semantic_index[attrib]); -+ break; -+ } -+ } -+ } -+ -+ LLVMValueRef mask_result = 0; -+ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { -+ mask_result = lp_build_mask_end(&mask); -+ } -+ -+ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); -+ -+ if (key.alphaTest.enabled) { -+ unsigned linkage = -+ locate_linkage(TGSI_SEMANTIC_COLOR, 0, &ctx->fs->info.base) + 1; -+ -+ Value *alpha = LOAD( -+ pPS, {0, SWR_PS_CONTEXT_shaded, linkage, 3 /* alpha */}, "alpha"); -+ Value *ref = VIMMED1(key.alphaTest.ref_value); -+ -+ CmpInst::Predicate cmp = CmpInst::Predicate::FCMP_FALSE; -+ switch (key.alphaTest.func) { -+ case PIPE_FUNC_NEVER: -+ cmp = CmpInst::Predicate::FCMP_FALSE; -+ break; -+ case PIPE_FUNC_LESS: -+ cmp = CmpInst::Predicate::FCMP_OLT; -+ break; -+ case PIPE_FUNC_EQUAL: -+ cmp = 
CmpInst::Predicate::FCMP_OEQ; -+ break; -+ case PIPE_FUNC_LEQUAL: -+ cmp = CmpInst::Predicate::FCMP_OLE; -+ break; -+ case PIPE_FUNC_GREATER: -+ cmp = CmpInst::Predicate::FCMP_OGT; -+ break; -+ case PIPE_FUNC_NOTEQUAL: -+ cmp = CmpInst::Predicate::FCMP_ONE; -+ break; -+ case PIPE_FUNC_GEQUAL: -+ cmp = CmpInst::Predicate::FCMP_OGE; -+ break; -+ case PIPE_FUNC_ALWAYS: -+ cmp = CmpInst::Predicate::FCMP_TRUE; -+ break; -+ } -+ -+ Value *alpha_result = -+ IRB()->CreateFCmp(cmp, alpha, ref, "alphaTestFunc"); -+ -+ mask_result = -+ wrap(AND(unwrap(mask_result), S_EXT(alpha_result, mSimdInt32Ty))); -+ } -+ -+ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { -+ STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_mask}); -+ } -+ -+ RET_VOID(); -+ -+ gallivm_verify_function(gallivm, wrap(pFunction)); -+ -+ gallivm_compile_module(gallivm); -+ -+ PFN_PIXEL_KERNEL kernel = -+ (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); -+ debug_printf("frag shader %p\n", kernel); -+ assert(kernel && "Error: FragShader = NULL"); -+ -+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) -+ JM()->mIsModuleFinalized = true; -+#endif -+ -+ return kernel; -+} -+ -+PFN_PIXEL_KERNEL -+swr_compile_fs(struct swr_context *ctx, swr_jit_key &key) -+{ -+ BuilderSWR builder( -+ reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr)); -+ return builder.CompileFS(ctx, key); -+} -diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h -new file mode 100644 -index 0000000..2962646 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_shader.h -@@ -0,0 +1,61 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#pragma once -+ -+class swr_vertex_shader; -+class swr_fragment_shader; -+class swr_jit_key; -+ -+PFN_VERTEX_FUNC -+swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs); -+ -+PFN_PIXEL_KERNEL -+swr_compile_fs(struct swr_context *ctx, swr_jit_key &key); -+ -+void swr_generate_fs_key(struct swr_jit_key &key, -+ struct swr_context *ctx, -+ swr_fragment_shader *swr_fs); -+ -+struct swr_jit_key { -+ unsigned nr_cbufs; -+ unsigned light_twoside; -+ ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; -+ ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; -+ unsigned nr_samplers; -+ unsigned nr_sampler_views; -+ struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ struct pipe_alpha_state alphaTest; -+}; -+ -+namespace std -+{ -+template <> struct hash { -+ std::size_t operator()(const swr_jit_key &k) const -+ { -+ return util_hash_crc32(&k, sizeof(k)); -+ } -+}; -+}; -+ -+bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs); -diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp -new file mode 100644 -index 0000000..fa16844 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_state.cpp -@@ -0,0 +1,1344 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "common/os.h" -+#include "jit_api.h" -+#include "JitManager.h" -+#include "state_llvm.h" -+ -+#include "gallivm/lp_bld_tgsi.h" -+#include "util/u_format.h" -+ -+#include "util/u_memory.h" -+#include "util/u_inlines.h" -+#include "util/u_helpers.h" -+#include "util/u_framebuffer.h" -+ -+#include "swr_state.h" -+#include "swr_context.h" -+#include "swr_context_llvm.h" -+#include "swr_screen.h" -+#include "swr_resource.h" -+#include "swr_tex_sample.h" -+#include "swr_scratch.h" -+#include "swr_shader.h" -+ -+/* These should be pulled out into separate files as necessary -+ * Just initializing everything here to get going. 
*/ -+ -+static void * -+swr_create_blend_state(struct pipe_context *pipe, -+ const struct pipe_blend_state *blend) -+{ -+ struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state); -+ -+ memcpy(&state->pipe, blend, sizeof(*blend)); -+ -+ struct pipe_blend_state *pipe_blend = &state->pipe; -+ -+ for (int target = 0; -+ target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS); -+ target++) { -+ state->compileState[target].independentAlphaBlendEnable = -+ pipe_blend->independent_blend_enable; -+ -+ struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target]; -+ SWR_RENDER_TARGET_BLEND_STATE &targetState = -+ state->compileState[target].blendState; -+ -+ if (target != 0 && !pipe_blend->independent_blend_enable) { -+ memcpy(&targetState, &state->compileState[0].blendState, sizeof(SWR_RENDER_TARGET_BLEND_STATE)); -+ continue; -+ } -+ -+ targetState.colorBlendEnable = rt_blend->blend_enable; -+ if (targetState.colorBlendEnable) { -+ targetState.sourceAlphaBlendFactor = -+ swr_convert_blend_factor(rt_blend->alpha_src_factor); -+ targetState.destAlphaBlendFactor = -+ swr_convert_blend_factor(rt_blend->alpha_dst_factor); -+ targetState.sourceBlendFactor = -+ swr_convert_blend_factor(rt_blend->rgb_src_factor); -+ targetState.destBlendFactor = -+ swr_convert_blend_factor(rt_blend->rgb_dst_factor); -+ -+ targetState.colorBlendFunc = -+ swr_convert_blend_func(rt_blend->rgb_func); -+ targetState.alphaBlendFunc = -+ swr_convert_blend_func(rt_blend->alpha_func); -+ } -+ -+ targetState.writeDisableRed = -+ (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1; -+ targetState.writeDisableGreen = -+ (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1; -+ targetState.writeDisableBlue = -+ (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1; -+ targetState.writeDisableAlpha = -+ (rt_blend->colormask & PIPE_MASK_A) ? 
0 : 1; -+ } -+ -+ return state; -+} -+ -+static void -+swr_bind_blend_state(struct pipe_context *pipe, void *blend) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->blend == blend) -+ return; -+ -+ ctx->blend = (swr_blend_state *)blend; -+ -+ ctx->dirty |= SWR_NEW_BLEND; -+} -+ -+static void -+swr_delete_blend_state(struct pipe_context *pipe, void *blend) -+{ -+ FREE(blend); -+} -+ -+static void -+swr_set_blend_color(struct pipe_context *pipe, -+ const struct pipe_blend_color *color) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->blend_color = *color; -+ -+ ctx->dirty |= SWR_NEW_BLEND; -+} -+ -+static void -+swr_set_stencil_ref(struct pipe_context *pipe, -+ const struct pipe_stencil_ref *ref) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->stencil_ref = *ref; -+ -+ ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; -+} -+ -+static void * -+swr_create_depth_stencil_state( -+ struct pipe_context *pipe, -+ const struct pipe_depth_stencil_alpha_state *depth_stencil) -+{ -+ struct pipe_depth_stencil_alpha_state *state; -+ -+ state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil, -+ sizeof *depth_stencil); -+ -+ return state; -+} -+ -+static void -+swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil) -+ return; -+ -+ ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil; -+ -+ ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; -+} -+ -+static void -+swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) -+{ -+ FREE(depth); -+} -+ -+ -+static void * -+swr_create_rasterizer_state(struct pipe_context *pipe, -+ const struct pipe_rasterizer_state *rast) -+{ -+ struct pipe_rasterizer_state *state; -+ state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast); -+ -+ return state; -+} -+ -+static void -+swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ const struct pipe_rasterizer_state *rasterizer = -+ (const struct pipe_rasterizer_state *)handle; -+ -+ if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer) -+ return; -+ -+ ctx->rasterizer = (pipe_rasterizer_state *)rasterizer; -+ -+ ctx->dirty |= SWR_NEW_RASTERIZER; -+} -+ -+static void -+swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) -+{ -+ FREE(rasterizer); -+} -+ -+ -+static void * -+swr_create_sampler_state(struct pipe_context *pipe, -+ const struct pipe_sampler_state *sampler) -+{ -+ struct pipe_sampler_state *state = -+ (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler); -+ -+ return state; -+} -+ -+static void -+swr_bind_sampler_states(struct pipe_context *pipe, -+ unsigned shader, -+ unsigned start, -+ unsigned num, -+ void **samplers) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ unsigned i; -+ -+ assert(shader < PIPE_SHADER_TYPES); -+ assert(start + num <= Elements(ctx->samplers[shader])); -+ -+ /* set the new samplers */ -+ ctx->num_samplers[shader] = num; -+ for (i = 0; i < num; i++) { -+ ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i]; -+ } -+ -+ ctx->dirty |= SWR_NEW_SAMPLER; -+} -+ -+static void -+swr_delete_sampler_state(struct pipe_context *pipe, void *sampler) -+{ -+ FREE(sampler); -+} -+ -+ -+static struct pipe_sampler_view * -+swr_create_sampler_view(struct pipe_context *pipe, -+ struct pipe_resource *texture, -+ const struct pipe_sampler_view *templ) -+{ -+ struct 
pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); -+ -+ if (view) { -+ *view = *templ; -+ view->reference.count = 1; -+ view->texture = NULL; -+ pipe_resource_reference(&view->texture, texture); -+ view->context = pipe; -+ } -+ -+ return view; -+} -+ -+static void -+swr_set_sampler_views(struct pipe_context *pipe, -+ unsigned shader, -+ unsigned start, -+ unsigned num, -+ struct pipe_sampler_view **views) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ uint i; -+ -+ assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+ assert(shader < PIPE_SHADER_TYPES); -+ assert(start + num <= Elements(ctx->sampler_views[shader])); -+ -+ /* set the new sampler views */ -+ ctx->num_sampler_views[shader] = num; -+ for (i = 0; i < num; i++) { -+ /* Note: we're using pipe_sampler_view_release() here to work around -+ * a possible crash when the old view belongs to another context that -+ * was already destroyed. -+ */ -+ pipe_sampler_view_release(pipe, &ctx->sampler_views[shader][start + i]); -+ pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], -+ views[i]); -+ } -+ -+ ctx->dirty |= SWR_NEW_SAMPLER_VIEW; -+} -+ -+static void -+swr_sampler_view_destroy(struct pipe_context *pipe, -+ struct pipe_sampler_view *view) -+{ -+ pipe_resource_reference(&view->texture, NULL); -+ FREE(view); -+} -+ -+static void * -+swr_create_vs_state(struct pipe_context *pipe, -+ const struct pipe_shader_state *vs) -+{ -+ struct swr_vertex_shader *swr_vs = -+ (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader); -+ if (!swr_vs) -+ return NULL; -+ -+ swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens); -+ swr_vs->pipe.stream_output = vs->stream_output; -+ -+ lp_build_tgsi_info(vs->tokens, &swr_vs->info); -+ -+ swr_vs->func = swr_compile_vs(pipe, swr_vs); -+ -+ swr_vs->soState = {0}; -+ -+ if (swr_vs->pipe.stream_output.num_outputs) { -+ pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output; -+ -+ swr_vs->soState.soEnable = true; -+ // soState.rasterizerDisable set on state dirty -+ // soState.streamToRasterizer not used -+ -+ for (uint32_t i = 0; i < stream_output->num_outputs; i++) { -+ swr_vs->soState.streamMasks[stream_output->output[i].stream] |= -+ 1 << (stream_output->output[i].register_index - 1); -+ } -+ for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { -+ swr_vs->soState.streamNumEntries[i] = -+ _mm_popcnt_u32(swr_vs->soState.streamMasks[i]); -+ } -+ } -+ -+ return swr_vs; -+} -+ -+static void -+swr_bind_vs_state(struct pipe_context *pipe, void *vs) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->vs == vs) -+ return; -+ -+ ctx->vs = (swr_vertex_shader *)vs; -+ ctx->dirty |= SWR_NEW_VS; -+} -+ -+static void -+swr_delete_vs_state(struct pipe_context *pipe, void *vs) -+{ -+ struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; -+ FREE((void *)swr_vs->pipe.tokens); -+ FREE(vs); -+} -+ -+static void * -+swr_create_fs_state(struct pipe_context *pipe, -+ const struct pipe_shader_state *fs) -+{ -+ struct swr_fragment_shader *swr_fs = new swr_fragment_shader; -+ if (!swr_fs) -+ return NULL; -+ -+ swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens); -+ -+ lp_build_tgsi_info(fs->tokens, &swr_fs->info); -+ -+ return swr_fs; -+} -+ -+ -+static void -+swr_bind_fs_state(struct pipe_context *pipe, void *fs) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->fs == fs) -+ return; -+ -+ ctx->fs = (swr_fragment_shader *)fs; -+ ctx->dirty |= SWR_NEW_FS; -+} -+ -+static void -+swr_delete_fs_state(struct pipe_context *pipe, void *fs) -+{ -+ struct swr_fragment_shader 
*swr_fs = (swr_fragment_shader *)fs; -+ FREE((void *)swr_fs->pipe.tokens); -+ delete swr_fs; -+} -+ -+ -+static void -+swr_set_constant_buffer(struct pipe_context *pipe, -+ uint shader, -+ uint index, -+ struct pipe_constant_buffer *cb) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct pipe_resource *constants = cb ? cb->buffer : NULL; -+ -+ assert(shader < PIPE_SHADER_TYPES); -+ assert(index < Elements(ctx->constants[shader])); -+ -+ /* note: reference counting */ -+ util_copy_constant_buffer(&ctx->constants[shader][index], cb); -+ -+ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { -+ ctx->dirty |= SWR_NEW_VSCONSTANTS; -+ } else if (shader == PIPE_SHADER_FRAGMENT) { -+ ctx->dirty |= SWR_NEW_FSCONSTANTS; -+ } -+ -+ if (cb && cb->user_buffer) { -+ pipe_resource_reference(&constants, NULL); -+ } -+} -+ -+ -+static void * -+swr_create_vertex_elements_state(struct pipe_context *pipe, -+ unsigned num_elements, -+ const struct pipe_vertex_element *attribs) -+{ -+ struct swr_vertex_element_state *velems; -+ assert(num_elements <= PIPE_MAX_ATTRIBS); -+ velems = CALLOC_STRUCT(swr_vertex_element_state); -+ if (velems) { -+ velems->fsState.numAttribs = num_elements; -+ for (unsigned i = 0; i < num_elements; i++) { -+ // XXX: we should do this keyed on the VS usage info -+ -+ const struct util_format_description *desc = -+ util_format_description(attribs[i].src_format); -+ -+ velems->fsState.layout[i].AlignedByteOffset = attribs[i].src_offset; -+ velems->fsState.layout[i].Format = -+ mesa_to_swr_format(attribs[i].src_format); -+ velems->fsState.layout[i].StreamIndex = -+ attribs[i].vertex_buffer_index; -+ velems->fsState.layout[i].InstanceEnable = -+ attribs[i].instance_divisor != 0; -+ velems->fsState.layout[i].ComponentControl0 = -+ desc->channel[0].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store0; -+ velems->fsState.layout[i].ComponentControl1 = -+ desc->channel[1].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store0; -+ velems->fsState.layout[i].ComponentControl2 = -+ desc->channel[2].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store0; -+ velems->fsState.layout[i].ComponentControl3 = -+ desc->channel[3].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store1Fp; -+ velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; -+ velems->fsState.layout[i].InstanceDataStepRate = -+ attribs[i].instance_divisor; -+ -+ /* Calculate the pitch of each stream */ -+ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo( -+ mesa_to_swr_format(attribs[i].src_format)); -+ velems->stream_pitch[attribs[i].vertex_buffer_index] += swr_desc.Bpp; -+ } -+ } -+ -+ return velems; -+} -+ -+static void -+swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_vertex_element_state *swr_velems = -+ (struct swr_vertex_element_state *)velems; -+ -+ ctx->velems = swr_velems; -+ ctx->dirty |= SWR_NEW_VERTEX; -+} -+ -+static void -+swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems) -+{ -+ /* XXX Need to destroy fetch shader? 
*/ -+ FREE(velems); -+} -+ -+ -+static void -+swr_set_vertex_buffers(struct pipe_context *pipe, -+ unsigned start_slot, -+ unsigned num_elements, -+ const struct pipe_vertex_buffer *buffers) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ assert(num_elements <= PIPE_MAX_ATTRIBS); -+ -+ util_set_vertex_buffers_count(ctx->vertex_buffer, -+ &ctx->num_vertex_buffers, -+ buffers, -+ start_slot, -+ num_elements); -+ -+ ctx->dirty |= SWR_NEW_VERTEX; -+} -+ -+ -+static void -+swr_set_index_buffer(struct pipe_context *pipe, -+ const struct pipe_index_buffer *ib) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ib) -+ memcpy(&ctx->index_buffer, ib, sizeof(ctx->index_buffer)); -+ else -+ memset(&ctx->index_buffer, 0, sizeof(ctx->index_buffer)); -+ -+ ctx->dirty |= SWR_NEW_VERTEX; -+} -+ -+static void -+swr_set_polygon_stipple(struct pipe_context *pipe, -+ const struct pipe_poly_stipple *stipple) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->poly_stipple = *stipple; /* struct copy */ -+ ctx->dirty |= SWR_NEW_STIPPLE; -+} -+ -+static void -+swr_set_clip_state(struct pipe_context *pipe, -+ const struct pipe_clip_state *clip) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->clip = *clip; -+ /* XXX Unimplemented, but prevents crash */ -+ -+ ctx->dirty |= SWR_NEW_CLIP; -+} -+ -+ -+static void -+swr_set_scissor_states(struct pipe_context *pipe, -+ unsigned start_slot, -+ unsigned num_viewports, -+ const struct pipe_scissor_state *scissor) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->scissor = *scissor; -+ ctx->dirty |= SWR_NEW_SCISSOR; -+} -+ -+static void -+swr_set_viewport_states(struct pipe_context *pipe, -+ unsigned start_slot, -+ unsigned num_viewports, -+ const struct pipe_viewport_state *vpt) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->viewport = *vpt; -+ ctx->dirty |= SWR_NEW_VIEWPORT; -+} -+ -+ -+static void -+swr_set_framebuffer_state(struct pipe_context *pipe, -+ const struct pipe_framebuffer_state *fb) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ boolean changed = !util_framebuffer_state_equal(&ctx->framebuffer, fb); -+ -+ assert(fb->width <= KNOB_GUARDBAND_WIDTH); -+ assert(fb->height <= KNOB_GUARDBAND_HEIGHT); -+ -+ if (changed) { -+ unsigned i; -+ for (i = 0; i < fb->nr_cbufs; ++i) -+ pipe_surface_reference(&ctx->framebuffer.cbufs[i], fb->cbufs[i]); -+ for (; i < ctx->framebuffer.nr_cbufs; ++i) -+ pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL); -+ -+ ctx->framebuffer.nr_cbufs = fb->nr_cbufs; -+ -+ ctx->framebuffer.width = fb->width; -+ ctx->framebuffer.height = fb->height; -+ -+ pipe_surface_reference(&ctx->framebuffer.zsbuf, fb->zsbuf); -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER; -+ } -+} -+ -+ -+static void -+swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (sample_mask != ctx->sample_mask) { -+ ctx->sample_mask = sample_mask; -+ ctx->dirty |= SWR_NEW_RASTERIZER; -+ } -+} -+ -+ -+void -+swr_update_derived(struct swr_context *ctx, -+ const struct pipe_draw_info *p_draw_info) -+{ -+ /* Any state that requires dirty flags to be re-triggered sets this mask */ -+ /* For example, user_buffer vertex and index buffers. 
*/ -+ unsigned post_update_dirty_flags = 0; -+ -+ /* Render Targets */ -+ if (ctx->dirty & SWR_NEW_FRAMEBUFFER) { -+ struct pipe_framebuffer_state *fb = &ctx->framebuffer; -+ SWR_SURFACE_STATE *new_attachment[SWR_NUM_ATTACHMENTS] = {0}; -+ boolean changed, need_idle; -+ UINT i; -+ -+ /* colorbuffer targets */ -+ if (fb->nr_cbufs) -+ for (i = 0; i < fb->nr_cbufs; ++i) -+ if (fb->cbufs[i]) { -+ struct swr_resource *colorBuffer = -+ swr_resource(fb->cbufs[i]->texture); -+ new_attachment[SWR_ATTACHMENT_COLOR0 + i] = &colorBuffer->swr; -+ } -+ -+ /* depth/stencil target */ -+ if (fb->zsbuf) { -+ struct swr_resource *depthStencilBuffer = -+ swr_resource(fb->zsbuf->texture); -+ if (depthStencilBuffer->has_depth) { -+ new_attachment[SWR_ATTACHMENT_DEPTH] = &depthStencilBuffer->swr; -+ -+ if (depthStencilBuffer->has_stencil) -+ new_attachment[SWR_ATTACHMENT_STENCIL] = -+ &depthStencilBuffer->secondary; -+ -+ } else if (depthStencilBuffer->has_stencil) -+ new_attachment[SWR_ATTACHMENT_STENCIL] = &depthStencilBuffer->swr; -+ } -+ -+ /* For each attachment that has changed, store tile contents to render -+ * target */ -+ changed = FALSE; -+ need_idle = FALSE; -+ for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { -+ if ((uintptr_t)ctx->current.attachment[i] -+ ^ (uintptr_t)new_attachment[i]) { -+ if (ctx->current.attachment[i]) { -+ enum SWR_TILE_STATE post_state; -+ post_state = -+ (new_attachment[i] ? SWR_TILE_INVALID : SWR_TILE_RESOLVED); -+ swr_store_render_target(ctx, i, post_state); -+ need_idle |= TRUE; -+ } -+ changed |= TRUE; -+ } -+ } -+ -+ /* -+ * Attachments are live, don't update any until idle -+ * (all StoreTiles, called by swr_store_render_targets, finish) -+ */ -+ if (need_idle) -+ SwrWaitForIdle(ctx->swrContext); -+ -+ if (changed) { -+ /* Update actual SWR core attachments, or clear those no longer -+ * attached */ -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; -+ for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { -+ if ((uintptr_t)ctx->current.attachment[i] -+ ^ (uintptr_t)new_attachment[i]) { -+ if (new_attachment[i]) { -+ renderTargets[i] = *new_attachment[i]; -+ ctx->current.attachment[i] = new_attachment[i]; -+ } else { -+ renderTargets[i] = {0}; -+ ctx->current.attachment[i] = nullptr; -+ } -+ } -+ } -+ -+ /* rendertarget changes also necessitate updating other state */ -+ ctx->dirty |= SWR_NEW_BLEND | SWR_NEW_SAMPLER_VIEW | SWR_NEW_VS -+ | SWR_NEW_FS | SWR_NEW_RASTERIZER | SWR_NEW_VIEWPORT -+ | SWR_NEW_DEPTH_STENCIL_ALPHA; -+ } -+ } -+ -+ /* Raster state */ -+ if (ctx->dirty & (SWR_NEW_RASTERIZER | SWR_NEW_VS)) { -+ SWR_RASTSTATE *rastState = &ctx->current.rastState; -+ rastState->cullMode = swr_convert_cull_mode(ctx->rasterizer->cull_face); -+ rastState->frontWinding = ctx->rasterizer->front_ccw -+ ? SWR_FRONTWINDING_CCW -+ : SWR_FRONTWINDING_CW; -+ rastState->scissorEnable = ctx->rasterizer->scissor; -+ rastState->pointSize = ctx->rasterizer->point_size > 0.0f -+ ? ctx->rasterizer->point_size -+ : 1.0f; -+ rastState->lineWidth = ctx->rasterizer->line_width > 0.0f -+ ? 
ctx->rasterizer->line_width -+ : 1.0f; -+ -+ rastState->pointParam = ctx->rasterizer->point_size_per_vertex; -+ rastState->pointSizeAttrib = ctx->vs->pointSizeAttrib; -+ -+ rastState->pointSpriteEnable = ctx->rasterizer->sprite_coord_enable; -+ rastState->pointSpriteTopOrigin = -+ ctx->rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; -+ rastState->pointSpriteFESlot = ctx->vs->info.base.num_outputs; -+ -+ /* XXX TODO: Add multisample */ -+ rastState->sampleCount = SWR_MULTISAMPLE_1X; -+ -+ bool do_offset = false; -+ switch (ctx->rasterizer->fill_front) { -+ case PIPE_POLYGON_MODE_FILL: -+ do_offset = ctx->rasterizer->offset_tri; -+ break; -+ case PIPE_POLYGON_MODE_LINE: -+ do_offset = ctx->rasterizer->offset_line; -+ break; -+ case PIPE_POLYGON_MODE_POINT: -+ do_offset = ctx->rasterizer->offset_point; -+ break; -+ } -+ -+ if (do_offset) { -+ rastState->depthBias = ctx->rasterizer->offset_units; -+ rastState->slopeScaledDepthBias = ctx->rasterizer->offset_scale; -+ rastState->depthBiasClamp = ctx->rasterizer->offset_clamp; -+ } else { -+ rastState->depthBias = 0; -+ rastState->slopeScaledDepthBias = 0; -+ rastState->depthBiasClamp = 0; -+ } -+ struct pipe_surface *zb = ctx->framebuffer.zsbuf; -+ if (zb && swr_resource(zb->texture)->has_depth) -+ rastState->depthFormat = swr_resource(zb->texture)->swr.format; -+ -+ rastState->depthClipEnable = ctx->rasterizer->depth_clip; -+ -+ SwrSetRastState(ctx->swrContext, rastState); -+ } -+ -+ /* Scissor */ -+ if (ctx->dirty & SWR_NEW_SCISSOR) { -+ BBOX bbox(ctx->scissor.miny, ctx->scissor.maxy, -+ ctx->scissor.minx, ctx->scissor.maxx); -+ SwrSetScissorRects(ctx->swrContext, 1, &bbox); -+ } -+ -+ /* Viewport */ -+ if (ctx->dirty & SWR_NEW_VIEWPORT) { -+ pipe_viewport_state *state = &ctx->viewport; -+ SWR_VIEWPORT *vp = &ctx->current.vp; -+ SWR_VIEWPORT_MATRIX *vpm = &ctx->current.vpm; -+ -+ const float scale_x = fabs(state->scale[0]); -+ const float scale_y = fabs(state->scale[1]); -+ const float scale_z = fabs(state->scale[2]); -+ -+ vp->x = state->translate[0] - scale_x; -+ vp->width = state->translate[0] + scale_x; -+ vp->y = state->translate[1] - scale_y; -+ vp->height = state->translate[1] + scale_y; -+ if (ctx->rasterizer->clip_halfz == 0) { -+ vp->minZ = state->translate[2] - scale_z; -+ vp->maxZ = state->translate[2] + scale_z; -+ } else { -+ vp->minZ = state->translate[2]; -+ vp->maxZ = state->translate[2] + scale_z; -+ } -+ -+ /* Flip viewport for all targets except samplable textures. */ -+ /* XXX This may not be sufficient for multiple rendertargets */ -+ struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; -+ if (cb && -+ !(swr_resource(cb->texture)->base.bind & PIPE_BIND_SAMPLER_VIEW)) { -+ /* Flip y and y-translate in the viewport matrix. */ -+ vpm->m00 = (vp->width - vp->x) / 2.0f; -+ vpm->m11 = (vp->y - vp->height) / 2.0f; -+ vpm->m22 = (vp->maxZ - vp->minZ) / 2.0f; -+ vpm->m30 = vp->x + vpm->m00; -+ vpm->m31 = vp->height + vpm->m11; -+ vpm->m32 = vp->minZ + vpm->m22; -+ } else { -+ vpm->m00 = (vp->width - vp->x) / 2.0f; -+ vpm->m11 = (vp->height - vp->y) / 2.0f; -+ vpm->m22 = (vp->maxZ - vp->minZ) / 2.0f; -+ vpm->m30 = vp->x + vpm->m00; -+ vpm->m31 = vp->y + vpm->m11; -+ vpm->m32 = vp->minZ + vpm->m22; -+ } -+ -+ /* Now that the matrix is calculated, clip the view coords to screen -+ * size. OpenGL allows for -ve x,y in the viewport. 
-+ */ -+ vp->x = std::max(vp->x, 0.0f); -+ vp->y = std::max(vp->y, 0.0f); -+ vp->width = std::min(vp->width, (float)ctx->framebuffer.width); -+ vp->height = std::min(vp->height, (float)ctx->framebuffer.height); -+ -+ SwrSetViewports(ctx->swrContext, 1, vp, vpm); -+ } -+ -+ /* Set vertex & index buffers */ -+ /* (using draw info if called by swr_draw_vbo) */ -+ if (ctx->dirty & SWR_NEW_VERTEX) { -+ uint32_t size, pitch, max_vertex, partial_inbounds; -+ const uint8_t *p_data; -+ -+ /* If being called by swr_draw_vbo, copy draw details */ -+ struct pipe_draw_info info = {0}; -+ if (p_draw_info) -+ info = *p_draw_info; -+ -+ /* vertex buffers */ -+ SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS]; -+ for (UINT i = 0; i < ctx->num_vertex_buffers; i++) { -+ pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; -+ -+ pitch = vb->stride; -+ if (!vb->user_buffer) { -+ /* VBO -+ * size is based on buffer->width0 rather than info.max_index -+ * to prevent having to validate VBO on each draw */ -+ size = vb->buffer->width0; -+ max_vertex = size / pitch; -+ partial_inbounds = size % pitch; -+ -+ p_data = (const uint8_t *)swr_resource_data(vb->buffer) -+ + vb->buffer_offset; -+ } else { -+ /* Client buffer -+ * client memory is one-time use, re-trigger SWR_NEW_VERTEX to -+ * revalidate on each draw */ -+ post_update_dirty_flags |= SWR_NEW_VERTEX; -+ -+ if (pitch) { -+ size = (info.max_index - info.min_index + 1) * pitch; -+ } else { -+ /* pitch = 0, means constant value -+ * set size to 1 vertex */ -+ size = ctx->velems->stream_pitch[i]; -+ } -+ -+ max_vertex = info.max_index + 1; -+ partial_inbounds = 0; -+ -+ /* Copy only needed vertices to scratch space */ -+ size = AlignUp(size, 4); -+ const void *ptr = (const uint8_t *) vb->user_buffer -+ + info.min_index * pitch; -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->vertex_buffer, ptr, size); -+ p_data = (const uint8_t *)ptr - info.min_index * pitch; -+ } -+ -+ swrVertexBuffers[i] = {0}; -+ swrVertexBuffers[i].index = i; -+ swrVertexBuffers[i].pitch = pitch; -+ swrVertexBuffers[i].pData = p_data; -+ swrVertexBuffers[i].size = size; -+ swrVertexBuffers[i].maxVertex = max_vertex; -+ swrVertexBuffers[i].partialInboundsSize = partial_inbounds; -+ } -+ -+ SwrSetVertexBuffers( -+ ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers); -+ -+ /* index buffer, if required (info passed in by swr_draw_vbo) */ -+ SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */ -+ if (info.indexed) { -+ pipe_index_buffer *ib = &ctx->index_buffer; -+ -+ pitch = ib->index_size ? 
ib->index_size : sizeof(uint32_t); -+ index_type = swr_convert_index_type(pitch); -+ -+ if (!ib->user_buffer) { -+ /* VBO -+ * size is based on buffer->width0 rather than info.count -+ * to prevent having to validate VBO on each draw */ -+ size = ib->buffer->width0; -+ p_data = -+ (const uint8_t *)swr_resource_data(ib->buffer) + ib->offset; -+ } else { -+ /* Client buffer -+ * client memory is one-time use, re-trigger SWR_NEW_VERTEX to -+ * revalidate on each draw */ -+ post_update_dirty_flags |= SWR_NEW_VERTEX; -+ -+ size = info.count * pitch; -+ size = AlignUp(size, 4); -+ -+ /* Copy indices to scratch space */ -+ const void *ptr = ib->user_buffer; -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->index_buffer, ptr, size); -+ p_data = (const uint8_t *)ptr; -+ } -+ -+ SWR_INDEX_BUFFER_STATE swrIndexBuffer; -+ swrIndexBuffer.format = swr_convert_index_type(ib->index_size); -+ swrIndexBuffer.pIndices = p_data; -+ swrIndexBuffer.size = size; -+ -+ SwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer); -+ } -+ -+ struct swr_vertex_element_state *velems = ctx->velems; -+ if (velems && velems->fsState.indexType != index_type) { -+ velems->fsFunc = NULL; -+ velems->fsState.indexType = index_type; -+ } -+ } -+ -+ /* VertexShader */ -+ if (ctx->dirty & SWR_NEW_VS) { -+ SwrSetVertexFunc(ctx->swrContext, ctx->vs->func); -+ } -+ -+ swr_jit_key key; -+ if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW -+ | SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_RASTERIZER -+ | SWR_NEW_FRAMEBUFFER)) { -+ memset(&key, 0, sizeof(key)); -+ swr_generate_fs_key(key, ctx, ctx->fs); -+ auto search = ctx->fs->map.find(key); -+ PFN_PIXEL_KERNEL func; -+ if (search != ctx->fs->map.end()) { -+ func = search->second; -+ } else { -+ func = swr_compile_fs(ctx, key); -+ ctx->fs->map.insert(std::make_pair(key, func)); -+ } -+ SWR_PS_STATE psState = {0}; -+ psState.pfnPixelShader = func; -+ psState.killsPixel = -+ ctx->fs->info.base.uses_kill || key.alphaTest.enabled; -+ psState.writesODepth = ctx->fs->info.base.writes_z; -+ psState.usesSourceDepth = ctx->fs->info.base.reads_z; -+ psState.maxRTSlotUsed = -+ (ctx->framebuffer.nr_cbufs != 0) ? 
-+ (ctx->framebuffer.nr_cbufs - 1) : -+ 0; -+ SwrSetPixelShaderState(ctx->swrContext, &psState); -+ } -+ -+ /* JIT sampler state */ -+ if (ctx->dirty & SWR_NEW_SAMPLER) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (unsigned i = 0; i < key.nr_samplers; i++) { -+ const struct pipe_sampler_state *sampler = -+ ctx->samplers[PIPE_SHADER_FRAGMENT][i]; -+ -+ if (sampler) { -+ pDC->samplersFS[i].min_lod = sampler->min_lod; -+ pDC->samplersFS[i].max_lod = sampler->max_lod; -+ pDC->samplersFS[i].lod_bias = sampler->lod_bias; -+ COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f); -+ } -+ } -+ } -+ -+ /* JIT sampler view state */ -+ if (ctx->dirty & SWR_NEW_SAMPLER_VIEW) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (unsigned i = 0; i < key.nr_sampler_views; i++) { -+ struct pipe_sampler_view *view = -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; -+ -+ if (view) { -+ struct pipe_resource *res = view->texture; -+ struct swr_resource *swr_res = swr_resource(res); -+ struct swr_jit_texture *jit_tex = &pDC->texturesFS[i]; -+ memset(jit_tex, 0, sizeof(*jit_tex)); -+ jit_tex->width = res->width0; -+ jit_tex->height = res->height0; -+ jit_tex->depth = res->depth0; -+ jit_tex->first_level = view->u.tex.first_level; -+ jit_tex->last_level = view->u.tex.last_level; -+ jit_tex->base_ptr = swr_res->swr.pBaseAddress; -+ -+ for (unsigned level = jit_tex->first_level; -+ level <= jit_tex->last_level; -+ level++) { -+ jit_tex->row_stride[level] = swr_res->row_stride[level]; -+ jit_tex->img_stride[level] = swr_res->img_stride[level]; -+ jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; -+ } -+ } -+ } -+ } -+ -+ /* VertexShader Constants */ -+ if (ctx->dirty & SWR_NEW_VSCONSTANTS) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { -+ const pipe_constant_buffer *cb = -+ &ctx->constants[PIPE_SHADER_VERTEX][i]; -+ pDC->num_constantsVS[i] = cb->buffer_size; -+ if (cb->buffer) -+ pDC->constantVS[i] = -+ (const float *)((const BYTE *)cb->buffer + cb->buffer_offset); -+ else { -+ /* Need to copy these constants to scratch space */ -+ if (cb->user_buffer && cb->buffer_size) { -+ const void *ptr = -+ ((const BYTE *)cb->user_buffer + cb->buffer_offset); -+ uint32_t size = AlignUp(cb->buffer_size, 4); -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->vs_constants, ptr, size); -+ pDC->constantVS[i] = (const float *)ptr; -+ } -+ } -+ } -+ } -+ -+ /* FragmentShader Constants */ -+ if (ctx->dirty & SWR_NEW_FSCONSTANTS) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { -+ const pipe_constant_buffer *cb = -+ &ctx->constants[PIPE_SHADER_FRAGMENT][i]; -+ pDC->num_constantsFS[i] = cb->buffer_size; -+ if (cb->buffer) -+ pDC->constantFS[i] = -+ (const float *)((const BYTE *)cb->buffer + cb->buffer_offset); -+ else { -+ /* Need to copy these constants to scratch space */ -+ if (cb->user_buffer && cb->buffer_size) { -+ const void *ptr = -+ ((const BYTE *)cb->user_buffer + cb->buffer_offset); -+ uint32_t size = AlignUp(cb->buffer_size, 4); -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->fs_constants, ptr, size); -+ pDC->constantFS[i] = (const float *)ptr; -+ } -+ } -+ } -+ } -+ -+ /* Depth/stencil state */ -+ if (ctx->dirty & SWR_NEW_DEPTH_STENCIL_ALPHA) { -+ struct 
pipe_depth_state *depth = &(ctx->depth_stencil->depth); -+ struct pipe_stencil_state *stencil = ctx->depth_stencil->stencil; -+ SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}}; -+ -+ /* XXX, incomplete. Need to flesh out stencil & alpha test state -+ struct pipe_stencil_state *front_stencil = -+ ctx->depth_stencil.stencil[0]; -+ struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1]; -+ struct pipe_alpha_state alpha; -+ */ -+ if (stencil[0].enabled) { -+ depthStencilState.stencilWriteEnable = 1; -+ depthStencilState.stencilTestEnable = 1; -+ depthStencilState.stencilTestFunc = -+ swr_convert_depth_func(stencil[0].func); -+ -+ depthStencilState.stencilPassDepthPassOp = -+ swr_convert_stencil_op(stencil[0].zpass_op); -+ depthStencilState.stencilPassDepthFailOp = -+ swr_convert_stencil_op(stencil[0].zfail_op); -+ depthStencilState.stencilFailOp = -+ swr_convert_stencil_op(stencil[0].fail_op); -+ depthStencilState.stencilWriteMask = stencil[0].writemask; -+ depthStencilState.stencilTestMask = stencil[0].valuemask; -+ depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0]; -+ } -+ if (stencil[1].enabled) { -+ depthStencilState.doubleSidedStencilTestEnable = 1; -+ -+ depthStencilState.backfaceStencilTestFunc = -+ swr_convert_depth_func(stencil[1].func); -+ -+ depthStencilState.backfaceStencilPassDepthPassOp = -+ swr_convert_stencil_op(stencil[1].zpass_op); -+ depthStencilState.backfaceStencilPassDepthFailOp = -+ swr_convert_stencil_op(stencil[1].zfail_op); -+ depthStencilState.backfaceStencilFailOp = -+ swr_convert_stencil_op(stencil[1].fail_op); -+ depthStencilState.backfaceStencilWriteMask = stencil[1].writemask; -+ depthStencilState.backfaceStencilTestMask = stencil[1].valuemask; -+ -+ depthStencilState.backfaceStencilRefValue = -+ ctx->stencil_ref.ref_value[1]; -+ } -+ -+ depthStencilState.depthTestEnable = depth->enabled; -+ depthStencilState.depthTestFunc = swr_convert_depth_func(depth->func); -+ depthStencilState.depthWriteEnable = depth->writemask; -+ SwrSetDepthStencilState(ctx->swrContext, &depthStencilState); -+ } -+ -+ /* Blend State */ -+ if (ctx->dirty & (SWR_NEW_BLEND | SWR_NEW_FRAMEBUFFER)) { -+ struct pipe_framebuffer_state *fb = &ctx->framebuffer; -+ -+ SWR_BLEND_STATE blendState; -+ memset(&blendState, 0, sizeof(blendState)); -+ blendState.independentAlphaBlendEnable = -+ ctx->blend->pipe.independent_blend_enable; -+ blendState.constantColor[0] = ctx->blend_color.color[0]; -+ blendState.constantColor[1] = ctx->blend_color.color[1]; -+ blendState.constantColor[2] = ctx->blend_color.color[2]; -+ blendState.constantColor[3] = ctx->blend_color.color[3]; -+ -+ /* If there are no color buffers bound, disable writes on RT0 -+ * and skip loop */ -+ if (fb->nr_cbufs == 0) { -+ blendState.renderTarget[0].writeDisableRed = 1; -+ blendState.renderTarget[0].writeDisableGreen = 1; -+ blendState.renderTarget[0].writeDisableBlue = 1; -+ blendState.renderTarget[0].writeDisableAlpha = 1; -+ } -+ else -+ for (int target = 0; -+ target < std::min(SWR_NUM_RENDERTARGETS, -+ PIPE_MAX_COLOR_BUFS); -+ target++) { -+ if (!fb->cbufs[target]) -+ continue; -+ -+ BLEND_COMPILE_STATE *compileState = -+ &ctx->blend->compileState[target]; -+ -+ struct swr_resource *colorBuffer = -+ swr_resource(fb->cbufs[target]->texture); -+ compileState->format = colorBuffer->swr.format; -+ -+ memcpy(&blendState.renderTarget[target], -+ &compileState->blendState, -+ sizeof(compileState->blendState)); -+ -+ PFN_BLEND_JIT_FUNC func = NULL; -+ auto search = ctx->blendJIT->find(*compileState); -+ 
if (search != ctx->blendJIT->end()) { -+ func = search->second; -+ } else { -+ HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; -+ func = JitCompileBlend(hJitMgr, *compileState); -+ debug_printf("BLEND shader %p\n", func); -+ assert(func && "Error: BlendShader = NULL"); -+ -+ ctx->blendJIT->insert(std::make_pair(*compileState, func)); -+ } -+ SwrSetBlendFunc(ctx->swrContext, target, func); -+ } -+ -+ SwrSetBlendState(ctx->swrContext, &blendState); -+ } -+ -+ if (ctx->dirty & SWR_NEW_STIPPLE) { -+ /* XXX What to do with this one??? SWR doesn't stipple */ -+ } -+ -+ if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { -+ ctx->vs->soState.rasterizerDisable = -+ ctx->rasterizer->rasterizer_discard; -+ SwrSetSoState(ctx->swrContext, &ctx->vs->soState); -+ -+ pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output; -+ -+ for (uint32_t i = 0; i < ctx->num_so_targets; i++) { -+ SWR_STREAMOUT_BUFFER buffer = {0}; -+ if (!ctx->so_targets[i]) -+ continue; -+ buffer.enable = true; -+ buffer.pBuffer = -+ (uint32_t *)swr_resource_data(ctx->so_targets[i]->buffer); -+ buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; -+ buffer.pitch = stream_output->stride[i]; -+ buffer.streamOffset = ctx->so_targets[i]->buffer_offset >> 2; -+ -+ SwrSetSoBuffers(ctx->swrContext, &buffer, i); -+ } -+ } -+ -+ uint32_t linkage = ctx->vs->linkageMask; -+ if (ctx->rasterizer->sprite_coord_enable) -+ linkage |= (1 << ctx->vs->info.base.num_outputs); -+ -+ SwrSetLinkage(ctx->swrContext, linkage, NULL); -+ -+ // set up frontend state -+ SWR_FRONTEND_STATE feState = {0}; -+ SwrSetFrontendState(ctx->swrContext, &feState); -+ -+ // set up backend state -+ SWR_BACKEND_STATE backendState = {0}; -+ backendState.numAttributes = 1; -+ backendState.numComponents[0] = 4; -+ backendState.constantInterpolationMask = ctx->fs->constantMask; -+ SwrSetBackendState(ctx->swrContext, &backendState); -+ -+ ctx->dirty = post_update_dirty_flags; -+} -+ -+static struct pipe_stream_output_target * -+swr_create_so_target(struct pipe_context *pipe, -+ struct pipe_resource *buffer, -+ unsigned buffer_offset, -+ unsigned buffer_size) -+{ -+ struct pipe_stream_output_target *target; -+ -+ target = CALLOC_STRUCT(pipe_stream_output_target); -+ if (!target) -+ return NULL; -+ -+ target->context = pipe; -+ target->reference.count = 1; -+ pipe_resource_reference(&target->buffer, buffer); -+ target->buffer_offset = buffer_offset; -+ target->buffer_size = buffer_size; -+ return target; -+} -+ -+static void -+swr_destroy_so_target(struct pipe_context *pipe, -+ struct pipe_stream_output_target *target) -+{ -+ pipe_resource_reference(&target->buffer, NULL); -+ FREE(target); -+} -+ -+static void -+swr_set_so_targets(struct pipe_context *pipe, -+ unsigned num_targets, -+ struct pipe_stream_output_target **targets, -+ const unsigned *offsets) -+{ -+ struct swr_context *swr = swr_context(pipe); -+ uint32_t i; -+ -+ assert(num_targets < MAX_SO_STREAMS); -+ -+ for (i = 0; i < num_targets; i++) { -+ pipe_so_target_reference( -+ (struct pipe_stream_output_target **)&swr->so_targets[i], -+ targets[i]); -+ } -+ -+ for (/* fall-through */; i < swr->num_so_targets; i++) { -+ pipe_so_target_reference( -+ (struct pipe_stream_output_target **)&swr->so_targets[i], NULL); -+ } -+ -+ swr->num_so_targets = num_targets; -+ -+ swr->dirty = SWR_NEW_SO; -+} -+ -+ -+void -+swr_state_init(struct pipe_context *pipe) -+{ -+ pipe->create_blend_state = swr_create_blend_state; -+ pipe->bind_blend_state = swr_bind_blend_state; -+ pipe->delete_blend_state = 
swr_delete_blend_state; -+ -+ pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state; -+ pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state; -+ pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state; -+ -+ pipe->create_rasterizer_state = swr_create_rasterizer_state; -+ pipe->bind_rasterizer_state = swr_bind_rasterizer_state; -+ pipe->delete_rasterizer_state = swr_delete_rasterizer_state; -+ -+ pipe->create_sampler_state = swr_create_sampler_state; -+ pipe->bind_sampler_states = swr_bind_sampler_states; -+ pipe->delete_sampler_state = swr_delete_sampler_state; -+ -+ pipe->create_sampler_view = swr_create_sampler_view; -+ pipe->set_sampler_views = swr_set_sampler_views; -+ pipe->sampler_view_destroy = swr_sampler_view_destroy; -+ -+ pipe->create_vs_state = swr_create_vs_state; -+ pipe->bind_vs_state = swr_bind_vs_state; -+ pipe->delete_vs_state = swr_delete_vs_state; -+ -+ pipe->create_fs_state = swr_create_fs_state; -+ pipe->bind_fs_state = swr_bind_fs_state; -+ pipe->delete_fs_state = swr_delete_fs_state; -+ -+ pipe->set_constant_buffer = swr_set_constant_buffer; -+ -+ pipe->create_vertex_elements_state = swr_create_vertex_elements_state; -+ pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state; -+ pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state; -+ -+ pipe->set_vertex_buffers = swr_set_vertex_buffers; -+ pipe->set_index_buffer = swr_set_index_buffer; -+ -+ pipe->set_polygon_stipple = swr_set_polygon_stipple; -+ pipe->set_clip_state = swr_set_clip_state; -+ pipe->set_scissor_states = swr_set_scissor_states; -+ pipe->set_viewport_states = swr_set_viewport_states; -+ -+ pipe->set_framebuffer_state = swr_set_framebuffer_state; -+ -+ pipe->set_blend_color = swr_set_blend_color; -+ pipe->set_stencil_ref = swr_set_stencil_ref; -+ -+ pipe->set_sample_mask = swr_set_sample_mask; -+ -+ pipe->create_stream_output_target = swr_create_so_target; -+ pipe->stream_output_target_destroy = swr_destroy_so_target; -+ pipe->set_stream_output_targets = swr_set_so_targets; -+} -diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h -new file mode 100644 -index 0000000..fdacd42 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_state.h -@@ -0,0 +1,240 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
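The update path above is dirty-flag driven: the bind/set callbacks wired up by swr_state_init() only stash the new state object and set a bit in ctx->dirty, and swr_update_derived() re-translates just the groups whose bits are set before each draw, re-arming bits such as SWR_NEW_VERTEX when client-memory buffers must be revalidated every draw. It also caches JIT-compiled fragment and blend kernels in hash maps keyed by their compile state, compiling only on a miss. A minimal, self-contained sketch of both patterns follows; all names here are hypothetical stand-ins, not the driver's own types.

    #include <cstddef>
    #include <cstdint>
    #include <unordered_map>

    enum : uint32_t { NEW_BLEND = 1u << 0, NEW_VERTEX = 1u << 1 };

    struct Key {
        uint32_t color_format;
        bool operator==(const Key &o) const { return color_format == o.color_format; }
    };
    struct KeyHash {
        std::size_t operator()(const Key &k) const { return k.color_format * 0x9e3779b9u; }
    };
    using Kernel = void (*)();

    static std::unordered_map<Key, Kernel, KeyHash> cache;

    // Stand-in for a real JIT compile such as JitCompileBlend().
    static Kernel compile_blend(const Key &k) { (void)k; return nullptr; }

    static Kernel get_blend_kernel(const Key &key)
    {
        auto search = cache.find(key);        // reuse an already-compiled kernel
        if (search != cache.end())
            return search->second;
        Kernel func = compile_blend(key);     // compile once on a cache miss
        cache.insert(std::make_pair(key, func));
        return func;
    }

    struct Context { uint32_t dirty = 0; Key blend_key = {}; };

    static void update_derived(Context &ctx)
    {
        uint32_t rearm = 0;
        if (ctx.dirty & NEW_BLEND)
            (void)get_blend_kernel(ctx.blend_key);  // only re-translate what changed
        if (ctx.dirty & NEW_VERTEX)
            rearm |= NEW_VERTEX;   // user-memory buffers force revalidation next draw
        ctx.dirty = rearm;         // mirrors "ctx->dirty = post_update_dirty_flags"
    }

    int main()
    {
        Context ctx;
        ctx.dirty = NEW_BLEND | NEW_VERTEX;
        update_derived(ctx);       // leaves only NEW_VERTEX set for the next draw
        return ctx.dirty == NEW_VERTEX ? 0 : 1;
    }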
IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_STATE_H -+#define SWR_STATE_H -+ -+#include "pipe/p_defines.h" -+#include "tgsi/tgsi_scan.h" -+#include "tgsi/tgsi_parse.h" -+#include "tgsi/tgsi_dump.h" -+#include "gallivm/lp_bld_tgsi.h" -+#include "util/u_hash.h" -+#include "api.h" -+#include "swr_tex_sample.h" -+#include "swr_shader.h" -+#include -+ -+/* skeleton */ -+struct swr_vertex_shader { -+ struct pipe_shader_state pipe; -+ struct lp_tgsi_info info; -+ unsigned linkageMask; -+ unsigned pointSizeAttrib; -+ PFN_VERTEX_FUNC func; -+ SWR_STREAMOUT_STATE soState; -+ PFN_SO_FUNC soFunc[PIPE_PRIM_MAX]; -+}; -+ -+struct swr_fragment_shader { -+ struct pipe_shader_state pipe; -+ struct lp_tgsi_info info; -+ unsigned constantMask; -+ std::unordered_map map; -+}; -+ -+/* Vertex element state */ -+struct swr_vertex_element_state { -+ FETCH_COMPILE_STATE fsState; -+ PFN_FETCH_FUNC fsFunc; -+#if 1 //BMCDEBUG -+ uint32_t stream_pitch[PIPE_MAX_ATTRIBS]; -+#endif -+}; -+ -+struct swr_blend_state { -+ struct pipe_blend_state pipe; -+ BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS]; -+}; -+ -+/* Shadows of SWR API DrawState */ -+struct swr_shadow_state { -+ SWR_SURFACE_STATE *attachment[SWR_NUM_ATTACHMENTS]; -+ SWR_RASTSTATE rastState; -+ SWR_VIEWPORT vp; -+ SWR_VIEWPORT_MATRIX vpm; -+}; -+ -+void swr_update_derived(struct swr_context *, -+ const struct pipe_draw_info * = nullptr); -+ -+/* -+ * Conversion functions: Convert mesa state defines to SWR. 
-+ */ -+ -+static INLINE SWR_STENCILOP -+swr_convert_stencil_op(const UINT op) -+{ -+ switch (op) { -+ case PIPE_STENCIL_OP_KEEP: -+ return STENCILOP_KEEP; -+ case PIPE_STENCIL_OP_ZERO: -+ return STENCILOP_ZERO; -+ case PIPE_STENCIL_OP_REPLACE: -+ return STENCILOP_REPLACE; -+ case PIPE_STENCIL_OP_INCR: -+ return STENCILOP_INCRSAT; -+ case PIPE_STENCIL_OP_DECR: -+ return STENCILOP_DECRSAT; -+ case PIPE_STENCIL_OP_INCR_WRAP: -+ return STENCILOP_INCR; -+ case PIPE_STENCIL_OP_DECR_WRAP: -+ return STENCILOP_DECR; -+ case PIPE_STENCIL_OP_INVERT: -+ return STENCILOP_INVERT; -+ default: -+ assert(0 && "Unsupported stencil op"); -+ return STENCILOP_KEEP; -+ } -+} -+ -+static INLINE SWR_FORMAT -+swr_convert_index_type(const UINT index_size) -+{ -+ switch (index_size) { -+ case sizeof(unsigned char): -+ return R8_UINT; -+ case sizeof(unsigned short): -+ return R16_UINT; -+ case sizeof(unsigned int): -+ return R32_UINT; -+ default: -+ assert(0 && "Unsupported index type"); -+ return R32_UINT; -+ } -+} -+ -+ -+static INLINE UINT -+swr_convert_depth_func(const UINT pipe_func) -+{ -+ switch (pipe_func) { -+ case PIPE_FUNC_NEVER: -+ return ZFUNC_NEVER; -+ case PIPE_FUNC_LESS: -+ return ZFUNC_LT; -+ case PIPE_FUNC_EQUAL: -+ return ZFUNC_EQ; -+ case PIPE_FUNC_LEQUAL: -+ return ZFUNC_LE; -+ case PIPE_FUNC_GREATER: -+ return ZFUNC_GT; -+ case PIPE_FUNC_NOTEQUAL: -+ return ZFUNC_NE; -+ case PIPE_FUNC_GEQUAL: -+ return ZFUNC_GE; -+ case PIPE_FUNC_ALWAYS: -+ return ZFUNC_ALWAYS; -+ default: -+ assert(0 && "Unsupported depth func"); -+ return ZFUNC_ALWAYS; -+ } -+} -+ -+ -+static INLINE SWR_CULLMODE -+swr_convert_cull_mode(const UINT cull_face) -+{ -+ switch (cull_face) { -+ case PIPE_FACE_NONE: -+ return SWR_CULLMODE_NONE; -+ case PIPE_FACE_FRONT: -+ return SWR_CULLMODE_FRONT; -+ case PIPE_FACE_BACK: -+ return SWR_CULLMODE_BACK; -+ case PIPE_FACE_FRONT_AND_BACK: -+ return SWR_CULLMODE_BOTH; -+ default: -+ assert(0 && "Invalid cull mode"); -+ return SWR_CULLMODE_NONE; -+ } -+} -+ -+static INLINE SWR_BLEND_OP -+swr_convert_blend_func(const UINT blend_func) -+{ -+ switch (blend_func) { -+ case PIPE_BLEND_ADD: -+ return BLENDOP_ADD; -+ case PIPE_BLEND_SUBTRACT: -+ return BLENDOP_SUBTRACT; -+ case PIPE_BLEND_REVERSE_SUBTRACT: -+ return BLENDOP_REVSUBTRACT; -+ case PIPE_BLEND_MIN: -+ return BLENDOP_MIN; -+ case PIPE_BLEND_MAX: -+ return BLENDOP_MAX; -+ default: -+ assert(0 && "Invalid blend func"); -+ return BLENDOP_ADD; -+ } -+} -+ -+static INLINE SWR_BLEND_FACTOR -+swr_convert_blend_factor(const UINT blend_factor) -+{ -+ switch (blend_factor) { -+ case PIPE_BLENDFACTOR_ONE: -+ return BLENDFACTOR_ONE; -+ case PIPE_BLENDFACTOR_SRC_COLOR: -+ return BLENDFACTOR_SRC_COLOR; -+ case PIPE_BLENDFACTOR_SRC_ALPHA: -+ return BLENDFACTOR_SRC_ALPHA; -+ case PIPE_BLENDFACTOR_DST_ALPHA: -+ return BLENDFACTOR_DST_ALPHA; -+ case PIPE_BLENDFACTOR_DST_COLOR: -+ return BLENDFACTOR_DST_COLOR; -+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: -+ return BLENDFACTOR_SRC_ALPHA_SATURATE; -+ case PIPE_BLENDFACTOR_CONST_COLOR: -+ return BLENDFACTOR_CONST_COLOR; -+ case PIPE_BLENDFACTOR_CONST_ALPHA: -+ return BLENDFACTOR_CONST_ALPHA; -+ case PIPE_BLENDFACTOR_SRC1_COLOR: -+ return BLENDFACTOR_SRC1_COLOR; -+ case PIPE_BLENDFACTOR_SRC1_ALPHA: -+ return BLENDFACTOR_SRC1_ALPHA; -+ case PIPE_BLENDFACTOR_ZERO: -+ return BLENDFACTOR_ZERO; -+ case PIPE_BLENDFACTOR_INV_SRC_COLOR: -+ return BLENDFACTOR_INV_SRC_COLOR; -+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: -+ return BLENDFACTOR_INV_SRC_ALPHA; -+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: -+ return 
BLENDFACTOR_INV_DST_ALPHA; -+ case PIPE_BLENDFACTOR_INV_DST_COLOR: -+ return BLENDFACTOR_INV_DST_COLOR; -+ case PIPE_BLENDFACTOR_INV_CONST_COLOR: -+ return BLENDFACTOR_INV_CONST_COLOR; -+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: -+ return BLENDFACTOR_INV_CONST_ALPHA; -+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR: -+ return BLENDFACTOR_INV_SRC1_COLOR; -+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: -+ return BLENDFACTOR_INV_SRC1_ALPHA; -+ default: -+ assert(0 && "Invalid blend factor"); -+ return BLENDFACTOR_ONE; -+ } -+} -+#endif -diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp -new file mode 100644 -index 0000000..8e01e32 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_tex_sample.cpp -@@ -0,0 +1,338 @@ -+/************************************************************************** -+ * -+ * Copyright 2009 VMware, Inc. -+ * All rights reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. -+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ * -+ **************************************************************************/ -+ -+/** -+ * Largely a copy of llvmpipe's lp_tex_sample.c -+ */ -+ -+/** -+ * Texture sampling code generation -+ * -+ * This file is nothing more than ugly glue between three largely independent -+ * entities: -+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) -+ * - texture sampling code generation (i.e., lp_build_sample_soa) -+ * - SWR driver -+ * -+ * All interesting code is in the functions mentioned above. There is really -+ * nothing to see here. -+ * -+ * @author Jose Fonseca -+ */ -+ -+#include "state.h" -+#include "JitManager.h" -+#include "state_llvm.h" -+ -+#include "pipe/p_defines.h" -+#include "pipe/p_shader_tokens.h" -+#include "gallivm/lp_bld_debug.h" -+#include "gallivm/lp_bld_const.h" -+#include "gallivm/lp_bld_type.h" -+#include "gallivm/lp_bld_sample.h" -+#include "gallivm/lp_bld_tgsi.h" -+#include "util/u_memory.h" -+ -+#include "swr_tex_sample.h" -+#include "swr_context_llvm.h" -+ -+ -+/** -+ * This provides the bridge between the sampler state store in -+ * lp_jit_context and lp_jit_texture and the sampler code -+ * generator. It provides the texture layout information required by -+ * the texture sampler code generator in terms of the state stored in -+ * lp_jit_context and lp_jit_texture in runtime. 
-+ */ -+struct swr_sampler_dynamic_state { -+ struct lp_sampler_dynamic_state base; -+ -+ const struct swr_sampler_static_state *static_state; -+}; -+ -+ -+/** -+ * This is the bridge between our sampler and the TGSI translator. -+ */ -+struct swr_sampler_soa { -+ struct lp_build_sampler_soa base; -+ -+ struct swr_sampler_dynamic_state dynamic_state; -+}; -+ -+ -+/** -+ * Fetch the specified member of the lp_jit_texture structure. -+ * \param emit_load if TRUE, emit the LLVM load instruction to actually -+ * fetch the field's value. Otherwise, just emit the -+ * GEP code to address the field. -+ * -+ * @sa http://llvm.org/docs/GetElementPtr.html -+ */ -+static LLVMValueRef -+swr_texture_member(const struct lp_sampler_dynamic_state *base, -+ struct gallivm_state *gallivm, -+ LLVMValueRef context_ptr, -+ unsigned texture_unit, -+ unsigned member_index, -+ const char *member_name, -+ boolean emit_load) -+{ -+ LLVMBuilderRef builder = gallivm->builder; -+ LLVMValueRef indices[4]; -+ LLVMValueRef ptr; -+ LLVMValueRef res; -+ -+ assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+ /* context[0] */ -+ indices[0] = lp_build_const_int32(gallivm, 0); -+ /* context[0].textures */ -+ indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); -+ /* context[0].textures[unit] */ -+ indices[2] = lp_build_const_int32(gallivm, texture_unit); -+ /* context[0].textures[unit].member */ -+ indices[3] = lp_build_const_int32(gallivm, member_index); -+ -+ ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); -+ -+ if (emit_load) -+ res = LLVMBuildLoad(builder, ptr, ""); -+ else -+ res = ptr; -+ -+ lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); -+ -+ return res; -+} -+ -+ -+/** -+ * Helper macro to instantiate the functions that generate the code to -+ * fetch the members of lp_jit_texture to fulfill the sampler code -+ * generator requests. -+ * -+ * This complexity is the price we have to pay to keep the texture -+ * sampler code generator a reusable module without dependencies to -+ * swr internals. -+ */ -+#define SWR_TEXTURE_MEMBER(_name, _emit_load) \ -+ static LLVMValueRef swr_texture_##_name( \ -+ const struct lp_sampler_dynamic_state *base, \ -+ struct gallivm_state *gallivm, \ -+ LLVMValueRef context_ptr, \ -+ unsigned texture_unit) \ -+ { \ -+ return swr_texture_member(base, \ -+ gallivm, \ -+ context_ptr, \ -+ texture_unit, \ -+ swr_jit_texture_##_name, \ -+ #_name, \ -+ _emit_load); \ -+ } -+ -+ -+SWR_TEXTURE_MEMBER(width, TRUE) -+SWR_TEXTURE_MEMBER(height, TRUE) -+SWR_TEXTURE_MEMBER(depth, TRUE) -+SWR_TEXTURE_MEMBER(first_level, TRUE) -+SWR_TEXTURE_MEMBER(last_level, TRUE) -+SWR_TEXTURE_MEMBER(base_ptr, TRUE) -+SWR_TEXTURE_MEMBER(row_stride, FALSE) -+SWR_TEXTURE_MEMBER(img_stride, FALSE) -+SWR_TEXTURE_MEMBER(mip_offsets, FALSE) -+ -+ -+/** -+ * Fetch the specified member of the lp_jit_sampler structure. -+ * \param emit_load if TRUE, emit the LLVM load instruction to actually -+ * fetch the field's value. Otherwise, just emit the -+ * GEP code to address the field. 
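swr_texture_member() builds a four-index GEP, so the generated code ends up addressing the same field a plain C expression would. Roughly, with hypothetical struct names standing in for the real swr_draw_context layout from swr_context_llvm.h, the index list { 0, swr_draw_context_texturesFS, texture_unit, member_index } corresponds to:

    #include <cstdint>

    // Hypothetical C-level picture only; the real layout is swr_draw_context.
    struct jit_texture  { uint32_t width, height, depth; /* ... more fields ... */ };
    struct draw_context { jit_texture texturesFS[32];    /* ... other members ... */ };

    // indices[0] = 0 dereferences the context pointer, indices[1] selects the
    // texturesFS member, indices[2] the texture unit, indices[3] the field;
    // with emit_load = TRUE an LLVM load of that address is emitted as well.
    static uint32_t load_width(const draw_context *ctx, unsigned unit)
    {
        return ctx->texturesFS[unit].width;
    }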
-+ * -+ * @sa http://llvm.org/docs/GetElementPtr.html -+ */ -+static LLVMValueRef -+swr_sampler_member(const struct lp_sampler_dynamic_state *base, -+ struct gallivm_state *gallivm, -+ LLVMValueRef context_ptr, -+ unsigned sampler_unit, -+ unsigned member_index, -+ const char *member_name, -+ boolean emit_load) -+{ -+ LLVMBuilderRef builder = gallivm->builder; -+ LLVMValueRef indices[4]; -+ LLVMValueRef ptr; -+ LLVMValueRef res; -+ -+ assert(sampler_unit < PIPE_MAX_SAMPLERS); -+ -+ /* context[0] */ -+ indices[0] = lp_build_const_int32(gallivm, 0); -+ /* context[0].samplers */ -+ indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); -+ /* context[0].samplers[unit] */ -+ indices[2] = lp_build_const_int32(gallivm, sampler_unit); -+ /* context[0].samplers[unit].member */ -+ indices[3] = lp_build_const_int32(gallivm, member_index); -+ -+ ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); -+ -+ if (emit_load) -+ res = LLVMBuildLoad(builder, ptr, ""); -+ else -+ res = ptr; -+ -+ lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); -+ -+ return res; -+} -+ -+ -+#define SWR_SAMPLER_MEMBER(_name, _emit_load) \ -+ static LLVMValueRef swr_sampler_##_name( \ -+ const struct lp_sampler_dynamic_state *base, \ -+ struct gallivm_state *gallivm, \ -+ LLVMValueRef context_ptr, \ -+ unsigned sampler_unit) \ -+ { \ -+ return swr_sampler_member(base, \ -+ gallivm, \ -+ context_ptr, \ -+ sampler_unit, \ -+ swr_jit_sampler_##_name, \ -+ #_name, \ -+ _emit_load); \ -+ } -+ -+ -+SWR_SAMPLER_MEMBER(min_lod, TRUE) -+SWR_SAMPLER_MEMBER(max_lod, TRUE) -+SWR_SAMPLER_MEMBER(lod_bias, TRUE) -+SWR_SAMPLER_MEMBER(border_color, FALSE) -+ -+ -+static void -+swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) -+{ -+ FREE(sampler); -+} -+ -+ -+/** -+ * Fetch filtered values from texture. -+ * The 'texel' parameter returns four vectors corresponding to R, G, B, A. -+ */ -+static void -+swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, -+ struct gallivm_state *gallivm, -+ const struct lp_sampler_params *params) -+{ -+ struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; -+ unsigned texture_index = params->texture_index; -+ unsigned sampler_index = params->sampler_index; -+ -+ assert(sampler_index < PIPE_MAX_SAMPLERS); -+ assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+#if 0 -+ lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); -+#else -+ lp_build_sample_soa( -+ &sampler->dynamic_state.static_state[texture_index].texture_state, -+ &sampler->dynamic_state.static_state[sampler_index].sampler_state, -+ &sampler->dynamic_state.base, -+ gallivm, -+ params); -+#endif -+} -+ -+/** -+ * Fetch the texture size. 
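Each SWR_TEXTURE_MEMBER / SWR_SAMPLER_MEMBER line is X-macro-style shorthand; expanding one instantiation by hand, in the context of this file, makes the generated accessor explicit. SWR_SAMPLER_MEMBER(min_lod, TRUE) produces, give or take formatting:

    static LLVMValueRef
    swr_sampler_min_lod(const struct lp_sampler_dynamic_state *base,
                        struct gallivm_state *gallivm,
                        LLVMValueRef context_ptr,
                        unsigned sampler_unit)
    {
       // Fetch (and, because emit_load is TRUE, load) samplersFS[unit].min_lod.
       return swr_sampler_member(base, gallivm, context_ptr, sampler_unit,
                                 swr_jit_sampler_min_lod, "min_lod", TRUE);
    }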
-+ */ -+static void -+swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, -+ struct gallivm_state *gallivm, -+ struct lp_type type, -+ unsigned texture_unit, -+ unsigned target, -+ LLVMValueRef context_ptr, -+ boolean is_sviewinfo, -+ enum lp_sampler_lod_property lod_property, -+ LLVMValueRef explicit_lod, /* optional */ -+ LLVMValueRef *sizes_out) -+{ -+ struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; -+ -+ assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+ lp_build_size_query_soa( -+ gallivm, -+ &sampler->dynamic_state.static_state[texture_unit].texture_state, -+ &sampler->dynamic_state.base, -+ type, -+ texture_unit, -+ target, -+ context_ptr, -+ is_sviewinfo, -+ lod_property, -+ explicit_lod, -+ sizes_out); -+} -+ -+ -+struct lp_build_sampler_soa * -+swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) -+{ -+ struct swr_sampler_soa *sampler; -+ -+ sampler = CALLOC_STRUCT(swr_sampler_soa); -+ if (!sampler) -+ return NULL; -+ -+ sampler->base.destroy = swr_sampler_soa_destroy; -+ sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel; -+ sampler->base.emit_size_query = swr_sampler_soa_emit_size_query; -+ sampler->dynamic_state.base.width = swr_texture_width; -+ sampler->dynamic_state.base.height = swr_texture_height; -+ sampler->dynamic_state.base.depth = swr_texture_depth; -+ sampler->dynamic_state.base.first_level = swr_texture_first_level; -+ sampler->dynamic_state.base.last_level = swr_texture_last_level; -+ sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr; -+ sampler->dynamic_state.base.row_stride = swr_texture_row_stride; -+ sampler->dynamic_state.base.img_stride = swr_texture_img_stride; -+ sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets; -+ sampler->dynamic_state.base.min_lod = swr_sampler_min_lod; -+ sampler->dynamic_state.base.max_lod = swr_sampler_max_lod; -+ sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias; -+ sampler->dynamic_state.base.border_color = swr_sampler_border_color; -+ -+ sampler->dynamic_state.static_state = static_state; -+ -+ return &sampler->base; -+} -diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h -new file mode 100644 -index 0000000..f5c368c ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_tex_sample.h -@@ -0,0 +1,47 @@ -+/************************************************************************** -+ * -+ * Copyright 2007 VMware, Inc. -+ * All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
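swr_sampler_soa_create() returns the callback table the gallivm TGSI translator consumes: emit_tex_sample and emit_size_query drive lp_build_sample_soa / lp_build_size_query_soa, and the dynamic-state members point at the accessors generated above. A rough caller-side sketch, assuming the call site is the fragment-shader JIT (presumably swr_shader.cpp) and relying only on names declared in this file and in swr_tex_sample.h:

    /* Hypothetical usage sketch, not the driver's actual call site. */
    static void
    compile_fs_with_sampler(const struct swr_sampler_static_state *static_state)
    {
       /* static_state holds one texture_state/sampler_state pair per bound view */
       struct lp_build_sampler_soa *sampler = swr_sampler_soa_create(static_state);

       /* ... pass `sampler` to the TGSI -> LLVM translation so TEX opcodes end
        * up in swr_sampler_soa_emit_fetch_texel() / _emit_size_query() ... */

       sampler->destroy(sampler);
    }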
-+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ * -+ **************************************************************************/ -+ -+#pragma once -+ -+#include "gallivm/lp_bld.h" -+ -+struct swr_sampler_static_state { -+ /* -+ * These attributes are effectively interleaved for more sane key handling. -+ * However, there might be lots of null space if the amount of samplers and -+ * textures isn't the same. -+ */ -+ struct lp_static_sampler_state sampler_state; -+ struct lp_static_texture_state texture_state; -+}; -+ -+/** -+ * Pure-LLVM texture sampling code generator. -+ * -+ */ -+struct lp_build_sampler_soa * -+swr_sampler_soa_create(const struct swr_sampler_static_state *key); -diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am -index d99caae..527d01b 100644 ---- a/src/gallium/targets/libgl-xlib/Makefile.am -+++ b/src/gallium/targets/libgl-xlib/Makefile.am -@@ -84,4 +84,9 @@ endif - EXTRA_lib@GL_LIB@_la_DEPENDENCIES = libgl-xlib.sym - EXTRA_DIST = SConscript libgl-xlib.sym - -+if HAVE_GALLIUM_SWR -+lib@GL_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) -+AM_CPPFLAGS += -DGALLIUM_SWR -+endif -+ - include $(top_srcdir)/install-gallium-links.mk -diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript -index df5a220..da77ad5 100644 ---- a/src/gallium/targets/libgl-xlib/SConscript -+++ b/src/gallium/targets/libgl-xlib/SConscript -@@ -46,6 +46,10 @@ if env['llvm']: - env.Append(CPPDEFINES = ['GALLIUM_LLVMPIPE']) - env.Prepend(LIBS = [llvmpipe]) - -+if env['llvm']: -+ env.Append(CPPDEFINES = ['GALLIUM_SWR']) -+ env.Prepend(LIBS = [swr]) -+ - # Disallow undefined symbols - if env['platform'] != 'darwin': - env.Append(SHLINKFLAGS = ['-Wl,-z,defs']) -diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am -index 38e515f..5d39486 100644 ---- a/src/gallium/targets/osmesa/Makefile.am -+++ b/src/gallium/targets/osmesa/Makefile.am -@@ -74,6 +74,12 @@ lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) - lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) - endif - -+if HAVE_GALLIUM_SWR -+AM_CPPFLAGS += -DGALLIUM_SWR -+lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) -+lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) -+endif -+ - EXTRA_lib@OSMESA_LIB@_la_DEPENDENCIES = osmesa.sym - EXTRA_DIST = \ - osmesa.sym \ --- -2.6.2 - diff --git a/0002-swr-484541-Initial-public-SWR.patch b/0002-swr-484541-Initial-public-SWR.patch deleted file mode 100644 index c43d9c0..0000000 --- a/0002-swr-484541-Initial-public-SWR.patch +++ /dev/null @@ -1,46197 +0,0 @@ -From 378e7aa8e96eb976aa4fe8cea6e522c3c2566031 Mon Sep 17 00:00:00 2001 -From: Tim Rowley -Date: Mon, 19 Oct 2015 13:34:59 -0500 -Subject: [PATCH 2/3] swr-484541: Initial public SWR - ---- - .../drivers/swr/rasterizer/common/containers.hpp | 208 + - .../drivers/swr/rasterizer/common/formats.cpp | 5029 ++++++++++++++++++++ - .../drivers/swr/rasterizer/common/formats.h | 222 + - src/gallium/drivers/swr/rasterizer/common/isa.hpp | 235 + - src/gallium/drivers/swr/rasterizer/common/os.h | 194 + - .../swr/rasterizer/common/rdtsc_buckets.cpp | 176 + - 
.../drivers/swr/rasterizer/common/rdtsc_buckets.h | 195 + - .../swr/rasterizer/common/rdtsc_buckets_shared.h | 167 + - .../drivers/swr/rasterizer/common/simdintrin.h | 792 +++ - .../drivers/swr/rasterizer/common/swr_assert.cpp | 141 + - .../drivers/swr/rasterizer/common/swr_assert.h | 84 + - src/gallium/drivers/swr/rasterizer/core/api.cpp | 1461 ++++++ - src/gallium/drivers/swr/rasterizer/core/api.h | 483 ++ - src/gallium/drivers/swr/rasterizer/core/arena.cpp | 126 + - src/gallium/drivers/swr/rasterizer/core/arena.h | 63 + - .../drivers/swr/rasterizer/core/backend.cpp | 1150 +++++ - src/gallium/drivers/swr/rasterizer/core/backend.h | 45 + - src/gallium/drivers/swr/rasterizer/core/blend.h | 318 ++ - src/gallium/drivers/swr/rasterizer/core/clip.cpp | 201 + - src/gallium/drivers/swr/rasterizer/core/clip.h | 851 ++++ - src/gallium/drivers/swr/rasterizer/core/context.h | 444 ++ - .../drivers/swr/rasterizer/core/depthstencil.h | 215 + - src/gallium/drivers/swr/rasterizer/core/fifo.hpp | 144 + - .../swr/rasterizer/core/format_conversion.h | 167 + - .../drivers/swr/rasterizer/core/format_traits.h | 2954 ++++++++++++ - .../drivers/swr/rasterizer/core/format_types.h | 1053 ++++ - .../drivers/swr/rasterizer/core/frontend.cpp | 1972 ++++++++ - src/gallium/drivers/swr/rasterizer/core/frontend.h | 326 ++ - src/gallium/drivers/swr/rasterizer/core/knobs.h | 139 + - .../drivers/swr/rasterizer/core/knobs_init.h | 98 + - .../drivers/swr/rasterizer/core/multisample.h | 562 +++ - src/gallium/drivers/swr/rasterizer/core/pa.h | 1205 +++++ - src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 1330 ++++++ - .../drivers/swr/rasterizer/core/rasterizer.cpp | 1217 +++++ - .../drivers/swr/rasterizer/core/rasterizer.h | 34 + - .../drivers/swr/rasterizer/core/rdtsc_core.cpp | 90 + - .../drivers/swr/rasterizer/core/rdtsc_core.h | 175 + - src/gallium/drivers/swr/rasterizer/core/state.h | 918 ++++ - .../drivers/swr/rasterizer/core/tessellator.h | 88 + - .../drivers/swr/rasterizer/core/threads.cpp | 884 ++++ - src/gallium/drivers/swr/rasterizer/core/threads.h | 62 + - .../drivers/swr/rasterizer/core/tilemgr.cpp | 105 + - src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 392 ++ - src/gallium/drivers/swr/rasterizer/core/utils.cpp | 148 + - src/gallium/drivers/swr/rasterizer/core/utils.h | 745 +++ - .../drivers/swr/rasterizer/jitter/JitManager.cpp | 292 ++ - .../drivers/swr/rasterizer/jitter/JitManager.h | 182 + - .../drivers/swr/rasterizer/jitter/blend_jit.cpp | 473 ++ - .../drivers/swr/rasterizer/jitter/blend_jit.h | 49 + - .../drivers/swr/rasterizer/jitter/builder.cpp | 56 + - .../drivers/swr/rasterizer/jitter/builder.h | 66 + - .../drivers/swr/rasterizer/jitter/builder_gen.cpp | 1052 ++++ - .../drivers/swr/rasterizer/jitter/builder_gen.h | 205 + - .../drivers/swr/rasterizer/jitter/builder_math.h | 34 + - .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 1195 +++++ - .../drivers/swr/rasterizer/jitter/builder_misc.h | 141 + - .../drivers/swr/rasterizer/jitter/builder_x86.cpp | 242 + - .../drivers/swr/rasterizer/jitter/builder_x86.h | 65 + - .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 1450 ++++++ - .../drivers/swr/rasterizer/jitter/fetch_jit.h | 128 + - .../drivers/swr/rasterizer/jitter/jit_api.h | 105 + - .../rasterizer/jitter/scripts/gen_llvm_types.py | 334 ++ - .../swr/rasterizer/jitter/streamout_jit.cpp | 348 ++ - .../drivers/swr/rasterizer/jitter/streamout_jit.h | 91 + - .../drivers/swr/rasterizer/memory/ClearTile.cpp | 287 ++ - .../drivers/swr/rasterizer/memory/Convert.h | 698 +++ - 
.../drivers/swr/rasterizer/memory/LoadTile.cpp | 382 ++ - .../drivers/swr/rasterizer/memory/StoreTile.cpp | 1645 +++++++ - .../swr/rasterizer/memory/TilingFunctions.h | 518 ++ - .../drivers/swr/rasterizer/memory/tilingtraits.h | 239 + - .../drivers/swr/rasterizer/scripts/gen_knobs.py | 79 + - .../drivers/swr/rasterizer/scripts/knob_defs.py | 212 + - .../swr/rasterizer/scripts/mako/__init__.py | 8 + - .../swr/rasterizer/scripts/mako/_ast_util.py | 845 ++++ - .../drivers/swr/rasterizer/scripts/mako/ast.py | 178 + - .../drivers/swr/rasterizer/scripts/mako/cache.py | 238 + - .../drivers/swr/rasterizer/scripts/mako/cmd.py | 62 + - .../drivers/swr/rasterizer/scripts/mako/codegen.py | 1237 +++++ - .../drivers/swr/rasterizer/scripts/mako/compat.py | 174 + - .../swr/rasterizer/scripts/mako/exceptions.py | 373 ++ - .../drivers/swr/rasterizer/scripts/mako/filters.py | 201 + - .../drivers/swr/rasterizer/scripts/mako/lexer.py | 441 ++ - .../drivers/swr/rasterizer/scripts/mako/lookup.py | 359 ++ - .../swr/rasterizer/scripts/mako/parsetree.py | 594 +++ - .../drivers/swr/rasterizer/scripts/mako/pygen.py | 299 ++ - .../swr/rasterizer/scripts/mako/pyparser.py | 232 + - .../drivers/swr/rasterizer/scripts/mako/runtime.py | 878 ++++ - .../swr/rasterizer/scripts/mako/template.py | 705 +++ - .../drivers/swr/rasterizer/scripts/mako/util.py | 360 ++ - .../rasterizer/scripts/templates/knobs.template | 106 + - 90 files changed, 45466 insertions(+) - create mode 100644 src/gallium/drivers/swr/rasterizer/common/containers.hpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/formats.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/formats.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/isa.hpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/os.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdintrin.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/swr_assert.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/api.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/api.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/blend.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/clip.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/clip.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/context.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/depthstencil.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/fifo.hpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_conversion.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_traits.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_types.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/frontend.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/frontend.h - create mode 100644 
src/gallium/drivers/swr/rasterizer/core/knobs.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/knobs_init.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/multisample.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/pa.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/state.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/tessellator.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/threads.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/threads.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/tilemgr.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/utils.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/utils.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/JitManager.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_math.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_api.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/Convert.h - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py - create mode 
100644 src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/template.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/util.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template - -diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp -new file mode 100644 -index 0000000..bc96c5f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp -@@ -0,0 +1,208 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+****************************************************************************/ -+ -+#ifndef SWRLIB_CONTAINERS_HPP__ -+#define SWRLIB_CONTAINERS_HPP__ -+ -+#include -+#include "common/os.h" -+ -+namespace SWRL -+{ -+ -+template -+struct UncheckedFixedVector -+{ -+ UncheckedFixedVector() : mSize(0) -+ { -+ } -+ -+ UncheckedFixedVector(std::size_t size, T const& exemplar) -+ { -+ this->mSize = 0; -+ for (std::size_t i = 0; i < size; ++i) -+ this->push_back(exemplar); -+ } -+ -+ template -+ UncheckedFixedVector(Iter fst, Iter lst) -+ { -+ this->mSize = 0; -+ for ( ; fst != lst; ++fst) -+ this->push_back(*fst); -+ } -+ -+ UncheckedFixedVector(UncheckedFixedVector const& UFV) -+ { -+ this->mSize = 0; -+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i) -+ (*this)[i] = UFV[i]; -+ this->mSize = UFV.size(); -+ } -+ -+ UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) -+ { -+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i) -+ (*this)[i] = UFV[i]; -+ this->mSize = UFV.size(); -+ return *this; -+ } -+ -+ T* begin() { return &this->mElements[0]; } -+ T* end() { return &this->mElements[0] + this->mSize; } -+ T const* begin() const { return &this->mElements[0]; } -+ T const* end() const { return &this->mElements[0] + this->mSize; } -+ -+ friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) -+ { -+ if (L.size() != R.size()) return false; -+ for (std::size_t i = 0, N = L.size(); i < N; ++i) -+ { -+ if (L[i] != R[i]) return false; -+ } -+ return true; -+ } -+ -+ friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) -+ { -+ if (L.size() != R.size()) return true; -+ for (std::size_t i = 0, N = L.size(); i < N; ++i) -+ { -+ if (L[i] != R[i]) return true; -+ } -+ return false; -+ } -+ -+ T& operator[](std::size_t idx) -+ { -+ return this->mElements[idx]; -+ } -+ T const& operator[](std::size_t idx) const -+ { -+ return this->mElements[idx]; -+ } -+ void push_back(T const& t) -+ { -+ this->mElements[this->mSize] = t; -+ ++this->mSize; -+ } -+ void pop_back() -+ { -+ SWR_ASSERT(this->mSize > 0); -+ --this->mSize; -+ } -+ T& back() -+ { -+ return this->mElements[this->mSize-1]; -+ } -+ T const& back() const -+ { -+ return this->mElements[this->mSize-1]; -+ } -+ bool empty() const -+ { -+ return this->mSize == 0; -+ } -+ std::size_t size() const -+ { -+ return this->mSize; -+ } -+ void resize(std::size_t sz) -+ { -+ this->mSize = sz; -+ } -+ void clear() -+ { -+ this->resize(0); -+ } -+private: -+ std::size_t mSize; -+ T mElements[NUM_ELEMENTS]; -+}; -+ -+template -+struct FixedStack : UncheckedFixedVector -+{ -+ FixedStack() {} -+ -+ void push(T const& t) -+ { -+ this->push_back(t); -+ } -+ -+ void pop() -+ { -+ this->pop_back(); -+ } -+ -+ T& top() -+ { -+ return this->back(); -+ } -+ -+ T const& top() const -+ { -+ return this->back(); -+ } -+}; -+ -+template -+struct CRCHash -+{ -+ static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects templated type size is even multiple of 4B"); -+ UINT operator()(const T& k) const -+ { -+ UINT *pData = (UINT*)&k; -+ UINT crc = 0; -+ for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i) -+ { -+ crc = _mm_crc32_u32(crc, pData[i]); -+ } -+ return crc; -+ } -+}; -+ -+}// end SWRL -+ -+namespace std -+{ -+ -+template -+struct hash> -+{ -+ size_t operator() (SWRL::UncheckedFixedVector const& v) const -+ { -+ if (v.size() == 0) return 0; -+ std::hash H; -+ size_t x = H(v[0]); -+ if (v.size() == 1) return x; -+ for (size_t i = 1; i < v.size(); ++i) -+ x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + 
(x>>2); -+ return x; -+ } -+}; -+ -+ -+}// end std. -+ -+#endif//SWRLIB_CONTAINERS_HPP__ -diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp -new file mode 100644 -index 0000000..7e90ee7 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp -@@ -0,0 +1,5029 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file formats.cpp -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#include "formats.h" -+ -+// lookup table for unorm8 srgb -> float conversion -+const uint32_t srgb8Table[256] = { -+ 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd, -+ 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1, -+ 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9, -+ 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f, -+ 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb, -+ 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19, -+ 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3, -+ 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706, 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307, -+ 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 
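The SWRL helpers that close out containers.hpp are small building blocks: UncheckedFixedVector stores its elements inline with no bounds checking, FixedStack adapts it into a push/pop/top stack, CRCHash hashes a POD key one DWORD at a time with _mm_crc32_u32, and the std::hash specialization folds per-element hashes together with the 0x9e3779b9 golden-ratio mix (the boost::hash_combine recipe). A self-contained sketch of that combine step, using plain std::vector so it stands alone:

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <vector>

    // Mix each element hash into the running value with the golden-ratio
    // constant plus shifts, exactly as the specialization above does.
    static std::size_t hash_range(const std::vector<uint32_t> &v)
    {
        if (v.empty()) return 0;
        std::hash<uint32_t> H;
        std::size_t x = H(v[0]);
        for (std::size_t i = 1; i < v.size(); ++i)
            x ^= H(v[i]) + 0x9e3779b9 + (x << 6) + (x >> 2);
        return x;
    }

The shifts spread high and low bits of the accumulated value so that vectors differing in only one element still land in different buckets.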
0x3e741e7f, 0x3e78188c, 0x3e7c1c38, 0x3e8014c2, 0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283, -+ 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333, -+ 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54, -+ 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, -+ 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1, -+ 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7, -+ 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2, -+ 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000, -+}; -+ -+// order must match SWR_FORMAT -+const SWR_FORMAT_INFO gFormatInfo[] = { -+ // R32G32B32A32_FLOAT (0x0) -+ { -+ "R32G32B32A32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_SINT (0x1) -+ { -+ "R32G32B32A32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_UINT (0x2) -+ { -+ "R32G32B32A32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
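srgb8Table above stores, for each 8-bit sRGB-encoded value, the IEEE-754 bit pattern of its linear-light float equivalent (entry 0 is 0.0f, entry 255 is 0x3f800000 = 1.0f), and the entries are consistent with the standard IEC 61966-2-1 decode curve. A hedged, stand-alone sketch of how such a table could be generated (the generated-file comment says the real table is produced by a script):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Standard sRGB -> linear transfer function.
    static float srgb_to_linear(float c)
    {
        return (c <= 0.04045f) ? c / 12.92f
                               : std::pow((c + 0.055f) / 1.055f, 2.4f);
    }

    int main()
    {
        for (int i = 0; i < 256; ++i) {
            float f = srgb_to_linear(i / 255.0f);
            uint32_t bits;
            std::memcpy(&bits, &f, sizeof(bits));   // reproduces the table to within rounding
            std::printf("0x%08x,%c", (unsigned)bits, (i % 16 == 15) ? '\n' : ' ');
        }
        return 0;
    }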
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32G32B32X32_FLOAT (0x6) -+ { -+ "R32G32B32X32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_SSCALED (0x7) -+ { -+ "R32G32B32A32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_USCALED (0x8) -+ { -+ "R32G32B32A32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x10 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x11 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 
0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x18 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x19 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x20 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x21 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x22 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x23 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x24 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x25 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x26 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x27 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x28 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x29 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ 
// 0x2f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x30 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x31 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x32 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x33 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x34 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x35 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x36 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x37 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x38 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x39 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, 
false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32G32B32_FLOAT (0x40) -+ { -+ "R32G32B32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32_SINT (0x41) -+ { -+ "R32G32B32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32_UINT (0x42) -+ { -+ "R32G32B32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x43 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x44 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32G32B32_SSCALED (0x45) -+ { -+ "R32G32B32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32_USCALED (0x46) -+ { -+ "R32G32B32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x47 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x48 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x49 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x50 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x51 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x52 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x53 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x54 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x55 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x56 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x57 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x58 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x59 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5a (Padding) -+ { -+ "UNKNOWN", 
-+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x60 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x61 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x62 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x63 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x64 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x65 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x66 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 
0.0f }, -+ 1, 1, }, -+ // 0x67 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x68 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x69 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x70 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x71 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x72 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x73 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, 
false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x74 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x75 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x76 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x77 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x78 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x79 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16A16_UNORM (0x80) -+ { -+ "R16G16B16A16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, 
-+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_SNORM (0x81) -+ { -+ "R16G16B16A16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_SINT (0x82) -+ { -+ "R16G16B16A16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_UINT (0x83) -+ { -+ "R16G16B16A16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_FLOAT (0x84) -+ { -+ "R16G16B16A16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_FLOAT (0x85) -+ { -+ "R32G32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_SINT (0x86) -+ { -+ "R32G32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_UINT (0x87) -+ { -+ "R32G32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_FLOAT_X8X24_TYPELESS (0x88) -+ { -+ "R32_FLOAT_X8X24_TYPELESS", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x89 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16X16_UNORM (0x8e) -+ { -+ "R16G16B16X16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, 
true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16X16_FLOAT (0x8f) -+ { -+ "R16G16B16X16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x90 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x91 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x92 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16A16_SSCALED (0x93) -+ { -+ "R16G16B16A16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_USCALED (0x94) -+ { -+ "R16G16B16A16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_SSCALED (0x95) -+ { -+ "R32G32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_USCALED (0x96) -+ { -+ "R32G32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x97 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32_FLOAT_X8X24_TYPELESS_LD (0x98) -+ { -+ "R32_FLOAT_X8X24_TYPELESS_LD", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x99 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa8 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xaa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xab (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xac (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xad (Padding) -+ { -+ "UNKNOWN", 
-+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xae (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xaf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb8 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 
0.0f }, -+ 1, 1, }, -+ // 0xba (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbe (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B8G8R8A8_UNORM (0xc0) -+ { -+ "B8G8R8A8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B8G8R8A8_UNORM_SRGB (0xc1) -+ { -+ "B8G8R8A8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_UNORM (0xc2) -+ { -+ "R10G10B10A2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_UNORM_SRGB (0xc3) -+ { -+ "R10G10B10A2_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_UINT (0xc4) -+ { -+ "R10G10B10A2_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xc5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xc6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8A8_UNORM (0xc7) -+ { -+ "R8G8B8A8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_UNORM_SRGB (0xc8) -+ { -+ "R8G8B8A8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_SNORM (0xc9) -+ { -+ "R8G8B8A8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_SINT (0xca) -+ { -+ "R8G8B8A8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_UINT (0xcb) -+ { -+ "R8G8B8A8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_UNORM (0xcc) -+ { -+ "R16G16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_SNORM (0xcd) -+ { -+ "R16G16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_SINT (0xce) -+ { -+ "R16G16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_UINT (0xcf) -+ { -+ "R16G16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_FLOAT (0xd0) -+ { -+ "R16G16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_UNORM (0xd1) -+ { -+ "B10G10R10A2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_UNORM_SRGB (0xd2) -+ { -+ "B10G10R10A2_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R11G11B10_FLOAT (0xd3) -+ { -+ "R11G11B10_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 11, 11, 10, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xd4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xd5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32_SINT (0xd6) -+ { -+ "R32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_UINT (0xd7) -+ { -+ "R32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_FLOAT (0xd8) -+ { -+ "R32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R24_UNORM_X8_TYPELESS (0xd9) -+ { -+ "R24_UNORM_X8_TYPELESS", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 24, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xda (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xdb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R24_UNORM_X8_TYPELESS_LD (0xdc) -+ { -+ "R24_UNORM_X8_TYPELESS_LD", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 24, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xdd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xde (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xdf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // A32_FLOAT (0xe5) -+ { -+ "A32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xe6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe8 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B8G8R8X8_UNORM (0xe9) -+ { -+ "B8G8R8X8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B8G8R8X8_UNORM_SRGB (0xea) -+ { -+ "B8G8R8X8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8X8_UNORM (0xeb) -+ { -+ "R8G8B8X8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8X8_UNORM_SRGB (0xec) -+ { -+ "R8G8B8X8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R9G9B9E5_SHAREDEXP (0xed) -+ { -+ "R9G9B9E5_SHAREDEXP", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 9, 9, 9, 5 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10X2_UNORM (0xee) -+ { -+ "B10G10R10X2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xef (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R10G10B10X2_USCALED (0xf3) -+ { -+ "R10G10B10X2_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_SSCALED (0xf4) -+ { -+ "R8G8B8A8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_USCALED (0xf5) -+ { -+ "R8G8B8A8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_SSCALED (0xf6) -+ { -+ "R16G16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_USCALED (0xf7) -+ { -+ "R16G16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_SSCALED (0xf8) -+ { -+ "R32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_USCALED (0xf9) -+ { -+ "R32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xfa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfe (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xff (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B5G6R5_UNORM (0x100) -+ { -+ "B5G6R5_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 0 }, // Swizzle -+ { 5, 6, 5, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G6R5_UNORM_SRGB (0x101) -+ { -+ "B5G6R5_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 0 }, // Swizzle -+ { 5, 6, 5, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 3, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G5R5A1_UNORM (0x102) -+ { -+ "B5G5R5A1_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G5R5A1_UNORM_SRGB (0x103) -+ { -+ "B5G5R5A1_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B4G4R4A4_UNORM (0x104) -+ { -+ "B4G4R4A4_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 4, 4, 4, 4 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B4G4R4A4_UNORM_SRGB (0x105) -+ { -+ "B4G4R4A4_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 4, 4, 4, 4 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_UNORM (0x106) -+ { -+ "R8G8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_SNORM (0x107) -+ { -+ "R8G8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_SINT (0x108) -+ { -+ "R8G8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_UINT (0x109) -+ { -+ "R8G8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_UNORM (0x10a) -+ { -+ "R16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_SNORM (0x10b) -+ { -+ "R16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_SINT (0x10c) -+ { -+ "R16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_UINT (0x10d) -+ { -+ "R16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_FLOAT (0x10e) -+ { -+ "R16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x10f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x110 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x111 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x112 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // A16_UNORM (0x113) -+ { -+ "A16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x114 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x115 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x116 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // A16_FLOAT (0x117) -+ { -+ "A16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x118 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x119 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B5G5R5X1_UNORM (0x11a) -+ { -+ "B5G5R5X1_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G5R5X1_UNORM_SRGB (0x11b) -+ { -+ "B5G5R5X1_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_SSCALED (0x11c) -+ { -+ "R8G8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_USCALED (0x11d) -+ { -+ "R8G8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_SSCALED (0x11e) -+ { -+ "R16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_USCALED (0x11f) -+ { -+ "R16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x120 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x121 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x122 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x123 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x124 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x125 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x126 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x127 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x128 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x129 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 
0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x130 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x131 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x132 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x133 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x134 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x135 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x136 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x137 (Padding) -+ { -+ "UNKNOWN", -+ { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x138 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x139 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8_UNORM (0x140) -+ { -+ "R8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_SNORM (0x141) -+ { -+ "R8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_SINT (0x142) -+ { -+ "R8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_UINT (0x143) -+ { -+ "R8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // A8_UNORM (0x144) -+ { -+ "A8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x145 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x146 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x147 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x148 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8_SSCALED (0x149) -+ { -+ "R8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_USCALED (0x14a) -+ { -+ "R8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x14b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x150 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x151 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x152 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x153 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x154 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x155 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 
0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x156 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x157 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x158 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x159 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x160 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x161 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x162 (Padding) -+ { -+ "UNKNOWN", -+ { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x163 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x164 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x165 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x166 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x167 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x168 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x169 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 
0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x170 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x171 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x172 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x173 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x174 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x175 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x176 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x177 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x178 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x179 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, 
false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x180 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x181 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x182 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // YCRCB_SWAPUVY (0x183) -+ { -+ "YCRCB_SWAPUVY", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ true, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 2, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x184 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x185 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // BC1_UNORM (0x186) -+ { -+ "BC1_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC2_UNORM (0x187) -+ { -+ "BC2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC3_UNORM (0x188) -+ { -+ "BC3_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC4_UNORM (0x189) -+ { -+ "BC4_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC5_UNORM (0x18a) -+ { -+ "BC5_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC1_UNORM_SRGB (0x18b) -+ { -+ "BC1_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC2_UNORM_SRGB (0x18c) -+ { -+ "BC2_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC3_UNORM_SRGB (0x18d) -+ { -+ "BC3_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // 0x18e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // YCRCB_SWAPUV (0x18f) -+ { -+ "YCRCB_SWAPUV", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ true, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 2, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x190 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x191 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x192 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8_UNORM (0x193) -+ { -+ "R8G8B8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_SNORM (0x194) -+ { -+ "R8G8B8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_SSCALED (0x195) -+ { -+ "R8G8B8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_USCALED (0x196) -+ { -+ "R8G8B8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x197 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x198 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // BC4_SNORM (0x199) -+ { -+ "BC4_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC5_SNORM (0x19a) -+ { -+ "BC5_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // R16G16B16_FLOAT (0x19b) -+ { -+ "R16G16B16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_UNORM (0x19c) -+ { -+ "R16G16B16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_SNORM (0x19d) -+ { -+ "R16G16B16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_SSCALED (0x19e) -+ { -+ "R16G16B16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_USCALED (0x19f) -+ { -+ "R16G16B16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1a0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // BC7_UNORM (0x1a2) -+ { -+ "BC7_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC7_UNORM_SRGB (0x1a3) -+ { -+ "BC7_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // 0x1a4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8_UNORM_SRGB (0x1a8) -+ { -+ "R8G8B8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1a9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1aa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ab (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ac (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ad (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ae (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1af (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16_UINT (0x1b0) -+ { -+ "R16G16B16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_SINT (0x1b1) -+ { -+ "R16G16B16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1b2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R10G10B10A2_SNORM (0x1b3) -+ { -+ "R10G10B10A2_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_USCALED (0x1b4) -+ { -+ "R10G10B10A2_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_SSCALED (0x1b5) -+ { -+ "R10G10B10A2_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_SINT (0x1b6) -+ { -+ "R10G10B10A2_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_SNORM (0x1b7) -+ { -+ "B10G10R10A2_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_USCALED (0x1b8) -+ { -+ "B10G10R10A2_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_SSCALED (0x1b9) -+ { -+ "B10G10R10A2_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_UINT (0x1ba) -+ { -+ "B10G10R10A2_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_SINT (0x1bb) -+ { -+ "B10G10R10A2_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1bc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1bd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1be (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1bf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8_UINT (0x1c8) -+ { -+ "R8G8B8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, 
SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_SINT (0x1c9) -+ { -+ "R8G8B8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h -new file mode 100644 -index 0000000..ff1fdb2 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/formats.h -@@ -0,0 +1,222 @@ -+ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file formats.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+#include "common/os.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TYPE - Format component type -+////////////////////////////////////////////////////////////////////////// -+enum SWR_TYPE -+{ -+ SWR_TYPE_UNKNOWN, -+ SWR_TYPE_UNUSED, -+ SWR_TYPE_UNORM, -+ SWR_TYPE_SNORM, -+ SWR_TYPE_UINT, -+ SWR_TYPE_SINT, -+ SWR_TYPE_FLOAT, -+ SWR_TYPE_SSCALED, -+ SWR_TYPE_USCALED, -+}; -+////////////////////////////////////////////////////////////////////////// -+/// SWR_FORMAT -+////////////////////////////////////////////////////////////////////////// -+enum SWR_FORMAT -+{ -+ R32G32B32A32_FLOAT = 0x0, -+ R32G32B32A32_SINT = 0x1, -+ R32G32B32A32_UINT = 0x2, -+ R32G32B32X32_FLOAT = 0x6, -+ R32G32B32A32_SSCALED = 0x7, -+ R32G32B32A32_USCALED = 0x8, -+ R32G32B32_FLOAT = 0x40, -+ R32G32B32_SINT = 0x41, -+ R32G32B32_UINT = 0x42, -+ R32G32B32_SSCALED = 0x45, -+ R32G32B32_USCALED = 0x46, -+ R16G16B16A16_UNORM = 0x80, -+ R16G16B16A16_SNORM = 0x81, -+ R16G16B16A16_SINT = 0x82, -+ R16G16B16A16_UINT = 0x83, -+ R16G16B16A16_FLOAT = 0x84, -+ R32G32_FLOAT = 0x85, -+ R32G32_SINT = 0x86, -+ R32G32_UINT = 0x87, -+ R32_FLOAT_X8X24_TYPELESS = 0x88, -+ R16G16B16X16_UNORM = 0x8E, -+ R16G16B16X16_FLOAT = 0x8F, -+ R16G16B16A16_SSCALED = 0x93, -+ R16G16B16A16_USCALED = 0x94, -+ R32G32_SSCALED = 0x95, -+ R32G32_USCALED = 0x96, -+ R32_FLOAT_X8X24_TYPELESS_LD = 0x98, -+ B8G8R8A8_UNORM = 0xC0, -+ B8G8R8A8_UNORM_SRGB = 0xC1, -+ R10G10B10A2_UNORM = 0xC2, -+ R10G10B10A2_UNORM_SRGB = 0xC3, -+ R10G10B10A2_UINT = 0xC4, -+ R8G8B8A8_UNORM = 0xC7, -+ R8G8B8A8_UNORM_SRGB = 0xC8, -+ R8G8B8A8_SNORM = 0xC9, -+ R8G8B8A8_SINT = 0xCA, -+ R8G8B8A8_UINT = 0xCB, -+ R16G16_UNORM = 0xCC, -+ R16G16_SNORM = 0xCD, -+ R16G16_SINT = 0xCE, -+ R16G16_UINT = 0xCF, -+ R16G16_FLOAT = 0xD0, -+ B10G10R10A2_UNORM = 0xD1, -+ B10G10R10A2_UNORM_SRGB = 0xD2, -+ R11G11B10_FLOAT = 0xD3, -+ R32_SINT = 0xD6, -+ R32_UINT = 0xD7, -+ R32_FLOAT = 0xD8, -+ R24_UNORM_X8_TYPELESS = 0xD9, -+ R24_UNORM_X8_TYPELESS_LD = 0xDC, -+ A32_FLOAT = 0xE5, -+ B8G8R8X8_UNORM = 0xE9, -+ B8G8R8X8_UNORM_SRGB = 0xEA, -+ R8G8B8X8_UNORM = 0xEB, -+ R8G8B8X8_UNORM_SRGB = 0xEC, -+ R9G9B9E5_SHAREDEXP = 0xED, -+ B10G10R10X2_UNORM = 0xEE, -+ R10G10B10X2_USCALED = 0xF3, -+ R8G8B8A8_SSCALED = 0xF4, -+ R8G8B8A8_USCALED = 0xF5, -+ R16G16_SSCALED = 0xF6, -+ R16G16_USCALED = 0xF7, -+ R32_SSCALED = 0xF8, -+ R32_USCALED = 0xF9, -+ B5G6R5_UNORM = 0x100, -+ B5G6R5_UNORM_SRGB = 0x101, -+ B5G5R5A1_UNORM = 0x102, -+ B5G5R5A1_UNORM_SRGB = 0x103, -+ B4G4R4A4_UNORM = 0x104, -+ B4G4R4A4_UNORM_SRGB = 0x105, -+ R8G8_UNORM = 0x106, -+ R8G8_SNORM = 0x107, -+ R8G8_SINT = 0x108, -+ R8G8_UINT = 0x109, -+ R16_UNORM = 0x10A, -+ R16_SNORM = 0x10B, -+ R16_SINT = 0x10C, -+ R16_UINT = 0x10D, -+ R16_FLOAT = 0x10E, -+ A16_UNORM = 0x113, -+ A16_FLOAT = 0x117, -+ B5G5R5X1_UNORM = 0x11A, -+ B5G5R5X1_UNORM_SRGB = 0x11B, -+ R8G8_SSCALED = 0x11C, -+ R8G8_USCALED = 0x11D, -+ R16_SSCALED = 0x11E, -+ R16_USCALED = 0x11F, -+ R8_UNORM = 0x140, -+ R8_SNORM = 0x141, -+ R8_SINT = 0x142, -+ R8_UINT = 0x143, -+ A8_UNORM = 0x144, -+ R8_SSCALED = 0x149, -+ R8_USCALED = 0x14A, -+ YCRCB_SWAPUVY = 0x183, -+ BC1_UNORM = 0x186, -+ BC2_UNORM = 0x187, -+ BC3_UNORM = 0x188, -+ BC4_UNORM = 0x189, -+ BC5_UNORM = 0x18A, -+ BC1_UNORM_SRGB = 0x18B, -+ BC2_UNORM_SRGB = 0x18C, -+ BC3_UNORM_SRGB = 0x18D, -+ YCRCB_SWAPUV = 0x18F, -+ 
R8G8B8_UNORM = 0x193, -+ R8G8B8_SNORM = 0x194, -+ R8G8B8_SSCALED = 0x195, -+ R8G8B8_USCALED = 0x196, -+ BC4_SNORM = 0x199, -+ BC5_SNORM = 0x19A, -+ R16G16B16_FLOAT = 0x19B, -+ R16G16B16_UNORM = 0x19C, -+ R16G16B16_SNORM = 0x19D, -+ R16G16B16_SSCALED = 0x19E, -+ R16G16B16_USCALED = 0x19F, -+ BC7_UNORM = 0x1A2, -+ BC7_UNORM_SRGB = 0x1A3, -+ R8G8B8_UNORM_SRGB = 0x1A8, -+ R16G16B16_UINT = 0x1B0, -+ R16G16B16_SINT = 0x1B1, -+ R10G10B10A2_SNORM = 0x1B3, -+ R10G10B10A2_USCALED = 0x1B4, -+ R10G10B10A2_SSCALED = 0x1B5, -+ R10G10B10A2_SINT = 0x1B6, -+ B10G10R10A2_SNORM = 0x1B7, -+ B10G10R10A2_USCALED = 0x1B8, -+ B10G10R10A2_SSCALED = 0x1B9, -+ B10G10R10A2_UINT = 0x1BA, -+ B10G10R10A2_SINT = 0x1BB, -+ R8G8B8_UINT = 0x1C8, -+ R8G8B8_SINT = 0x1C9, -+ NUM_SWR_FORMATS = 0x1CA, -+}; -+////////////////////////////////////////////////////////////////////////// -+/// SWR_FORMAT_INFO - Format information -+////////////////////////////////////////////////////////////////////////// -+struct SWR_FORMAT_INFO -+{ -+ const char* name; -+ SWR_TYPE type[4]; -+ uint32_t defaults[4]; -+ uint32_t swizzle[4]; ///< swizzle per component -+ uint32_t bpc[4]; ///< bits per component -+ uint32_t bpp; ///< bits per pixel -+ uint32_t Bpp; ///< bytes per pixel -+ uint32_t numComps; ///< number of components -+ bool isSRGB; -+ bool isBC; -+ bool isSubsampled; -+ bool isNormalized[4]; -+ float toFloat[4]; -+ uint32_t bcWidth; -+ uint32_t bcHeight; -+}; -+ -+extern const SWR_FORMAT_INFO gFormatInfo[]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Retrieves format info struct for given format. -+/// @param format - SWR format -+INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format) -+{ -+ return gFormatInfo[format]; -+} -+ -+// lookup table for unorm8 srgb -> float conversion -+extern const uint32_t srgb8Table[256]; -diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp -new file mode 100644 -index 0000000..ef38179 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp -@@ -0,0 +1,235 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+****************************************************************************/ -+ -+#pragma once -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if defined(_WIN32) -+#include -+#else -+#include -+#include -+#endif -+ -+class InstructionSet -+{ -+public: -+ InstructionSet() : CPU_Rep() {}; -+ -+ // getters -+ std::string Vendor(void) { return CPU_Rep.vendor_; } -+ std::string Brand(void) { return CPU_Rep.brand_; } -+ -+ bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; } -+ bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; } -+ bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; } -+ bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; } -+ bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; } -+ bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; } -+ bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; } -+ bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; } -+ bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; } -+ bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; } -+ bool AES(void) { return CPU_Rep.f_1_ECX_[25]; } -+ bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; } -+ bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; } -+ bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; } -+ -+ bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; } -+ bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; } -+ bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; } -+ bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; } -+ bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; } -+ bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; } -+ bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; } -+ bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; } -+ bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; } -+ -+ bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; } -+ bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; } -+ bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; } -+ bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; } -+ bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; } -+ bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; } -+ bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; } -+ bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; } -+ bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; } -+ bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; } -+ -+ bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; } -+ -+ bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; } -+ bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; } -+ bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; } -+ bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; } -+ bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; } -+ bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; } -+ -+ bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; } -+ bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; } -+ bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; } -+ bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; } -+ bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; } -+ -+ bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; } -+ bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; } -+ bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; } -+ bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; } -+ bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; } -+ bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; } -+ bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; } -+ -+private: -+ class InstructionSet_Internal -+ { -+ public: -+ InstructionSet_Internal() -+ : nIds_{ 0 
},
-+        nExIds_{ 0 },
-+        isIntel_{ false },
-+        isAMD_{ false },
-+        f_1_ECX_{ 0 },
-+        f_1_EDX_{ 0 },
-+        f_7_EBX_{ 0 },
-+        f_7_ECX_{ 0 },
-+        f_81_ECX_{ 0 },
-+        f_81_EDX_{ 0 },
-+        data_{},
-+        extdata_{}
-+        {
-+            //int cpuInfo[4] = {-1};
-+            std::array<int, 4> cpui;
-+
-+            // Calling __cpuid with 0x0 as the function_id argument
-+            // gets the number of the highest valid function ID.
-+#if defined(_WIN32)
-+            __cpuid(cpui.data(), 0);
-+            nIds_ = cpui[0];
-+#else
-+            nIds_ = __get_cpuid_max(0, NULL);
-+#endif
-+
-+            for (int i = 0; i <= nIds_; ++i)
-+            {
-+#if defined(_WIN32)
-+                __cpuidex(cpui.data(), i, 0);
-+#else
-+                int *data = cpui.data();
-+                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
-+#endif
-+                data_.push_back(cpui);
-+            }
-+
-+            // Capture vendor string
-+            char vendor[0x20];
-+            memset(vendor, 0, sizeof(vendor));
-+            *reinterpret_cast<int*>(vendor) = data_[0][1];
-+            *reinterpret_cast<int*>(vendor + 4) = data_[0][3];
-+            *reinterpret_cast<int*>(vendor + 8) = data_[0][2];
-+            vendor_ = vendor;
-+            if (vendor_ == "GenuineIntel")
-+            {
-+                isIntel_ = true;
-+            }
-+            else if (vendor_ == "AuthenticAMD")
-+            {
-+                isAMD_ = true;
-+            }
-+
-+            // load bitset with flags for function 0x00000001
-+            if (nIds_ >= 1)
-+            {
-+                f_1_ECX_ = data_[1][2];
-+                f_1_EDX_ = data_[1][3];
-+            }
-+
-+            // load bitset with flags for function 0x00000007
-+            if (nIds_ >= 7)
-+            {
-+                f_7_EBX_ = data_[7][1];
-+                f_7_ECX_ = data_[7][2];
-+            }
-+
-+            // Calling __cpuid with 0x80000000 as the function_id argument
-+            // gets the number of the highest valid extended ID.
-+#if defined(_WIN32)
-+            __cpuid(cpui.data(), 0x80000000);
-+            nExIds_ = cpui[0];
-+#else
-+            nExIds_ = __get_cpuid_max(0x80000000, NULL);
-+#endif
-+
-+            char brand[0x40];
-+            memset(brand, 0, sizeof(brand));
-+
-+            for (unsigned i = 0x80000000; i <= nExIds_; ++i)
-+            {
-+#if defined(_WIN32)
-+                __cpuidex(cpui.data(), i, 0);
-+#else
-+                int *data = cpui.data();
-+                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
-+#endif
-+                extdata_.push_back(cpui);
-+            }
-+
-+            // load bitset with flags for function 0x80000001
-+            if (nExIds_ >= 0x80000001)
-+            {
-+                f_81_ECX_ = extdata_[1][2];
-+                f_81_EDX_ = extdata_[1][3];
-+            }
-+
-+            // Interpret CPU brand string if reported
-+            if (nExIds_ >= 0x80000004)
-+            {
-+                memcpy(brand, extdata_[2].data(), sizeof(cpui));
-+                memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
-+                memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
-+                brand_ = brand;
-+            }
-+        };
-+
-+        int nIds_;
-+        unsigned nExIds_;
-+        std::string vendor_;
-+        std::string brand_;
-+        bool isIntel_;
-+        bool isAMD_;
-+        std::bitset<32> f_1_ECX_;
-+        std::bitset<32> f_1_EDX_;
-+        std::bitset<32> f_7_EBX_;
-+        std::bitset<32> f_7_ECX_;
-+        std::bitset<32> f_81_ECX_;
-+        std::bitset<32> f_81_EDX_;
-+        std::vector<std::array<int, 4>> data_;
-+        std::vector<std::array<int, 4>> extdata_;
-+    };
-+    const InstructionSet_Internal CPU_Rep;
-+};
-diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
-new file mode 100644
-index 0000000..d7def2b
---- /dev/null
-+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
-@@ -0,0 +1,194 @@
-+/****************************************************************************
-+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#ifndef __SWR_OS_H__ -+#define __SWR_OS_H__ -+ -+#include "core/knobs.h" -+ -+#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX) -+ -+#define SWR_API __cdecl -+ -+#ifndef _CRT_SECURE_NO_WARNINGS -+#define _CRT_SECURE_NO_WARNINGS -+#endif -+ -+#ifndef NOMINMAX -+#define NOMINMAX -+#endif -+#include "Windows.h" -+#include -+#include -+ -+#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD -+#define THREAD __declspec(thread) -+#define INLINE __forceinline -+#define DEBUGBREAK __debugbreak() -+ -+#define PRAGMA_WARNING_PUSH_DISABLE(...) 
\ -+ __pragma(warning(push));\ -+ __pragma(warning(disable:__VA_ARGS__)); -+ -+#define PRAGMA_WARNING_POP() __pragma(warning(pop)) -+ -+#if defined(_WIN32) -+#if defined(_WIN64) -+#define BitScanForwardSizeT BitScanForward64 -+#define _mm_popcount_sizeT _mm_popcnt_u64 -+#else -+#define BitScanForwardSizeT BitScanForward -+#define _mm_popcount_sizeT _mm_popcnt_u32 -+#endif -+#endif -+ -+#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) -+ -+#define SWR_API -+ -+#include -+#include -+#include -+#include -+#include -+ -+typedef void VOID; -+typedef void* LPVOID; -+typedef CARD8 BOOL; -+typedef wchar_t WCHAR; -+typedef uint16_t UINT16; -+typedef int INT; -+typedef int INT32; -+typedef unsigned int UINT; -+typedef uint32_t UINT32; -+typedef uint64_t UINT64; -+typedef int64_t INT64; -+typedef void* HANDLE; -+typedef float FLOAT; -+typedef int LONG; -+typedef CARD8 BYTE; -+typedef unsigned char UCHAR; -+typedef unsigned int DWORD; -+ -+#undef FALSE -+#define FALSE 0 -+ -+#undef TRUE -+#define TRUE 1 -+ -+#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH))) -+#define THREAD __thread -+#ifndef INLINE -+#define INLINE __inline -+#endif -+#define DEBUGBREAK asm ("int $3") -+#define __cdecl -+#define __declspec(X) -+ -+#define GCC_VERSION (__GNUC__ * 10000 \ -+ + __GNUC_MINOR__ * 100 \ -+ + __GNUC_PATCHLEVEL__) -+ -+#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500) -+inline -+uint64_t __rdtsc() -+{ -+ long low, high; -+ asm volatile("rdtsc" : "=a"(low), "=d"(high)); -+ return (low | ((uint64_t)high << 32)); -+} -+#endif -+ -+// Intrinsic not defined in gcc -+static INLINE -+void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a) -+{ -+ _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a)); -+ _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1)); -+} -+ -+inline -+unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask) -+{ -+ *Index = __builtin_ctz(Mask); -+ return (Mask != 0); -+} -+ -+inline -+unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask) -+{ -+ *Index = __builtin_clz(Mask); -+ return (Mask != 0); -+} -+ -+inline -+void *_aligned_malloc(unsigned int size, unsigned int alignment) -+{ -+ void *ret; -+ if (posix_memalign(&ret, alignment, size)) -+ { -+ return NULL; -+ } -+ return ret; -+} -+ -+inline -+unsigned char _bittest(const LONG *a, LONG b) -+{ -+ return ((*(unsigned *)(a) & (1 << b)) != 0); -+} -+ -+#if defined(_WIN32) -+static inline -+unsigned int _mm_popcnt_u32(unsigned int v) -+{ -+ return __builtin_popcount(v); -+} -+#endif -+ -+#define _aligned_free free -+#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) -+#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) -+#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) -+#define _ReadWriteBarrier() asm volatile("" ::: "memory") -+#define __stdcall -+ -+#define PRAGMA_WARNING_PUSH_DISABLE(...) -+#define PRAGMA_WARNING_POP() -+ -+#else -+ -+#error Unsupported OS/system. -+ -+#endif -+ -+#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) -+#if KNOB_SIMD_WIDTH == 8 -+#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32) -+#else -+#error Unknown SIMD width! 
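// Illustrative sketch (not part of the original patch): with KNOB_SIMD_WIDTH == 8,
// OSALIGNSIMD pads a declaration out to the 32-byte AVX register width, e.g.
//
//     OSALIGNSIMD(float) lanes[KNOB_SIMD_WIDTH];
//
// which expands to __declspec(align(32)) float lanes[8] on MSVC and to
// float __attribute__((aligned(32))) lanes[8] on gcc/clang via the OSALIGN macro above.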
-+#endif -+ -+#include "common/swr_assert.h" -+ -+#endif//__SWR_OS_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp -new file mode 100644 -index 0000000..469302b ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp -@@ -0,0 +1,176 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rdtsc_buckets.cpp -+* -+* @brief implementation of rdtsc buckets. -+* -+* Notes: -+* -+******************************************************************************/ -+#include "rdtsc_buckets.h" -+#include -+ -+THREAD UINT tlsThreadId = 0; -+ -+void BucketManager::RegisterThread(const std::string& name) -+{ -+ BUCKET_THREAD newThread; -+ newThread.name = name; -+ newThread.root.children.reserve(mBuckets.size()); -+ newThread.root.id = 0; -+ newThread.root.pParent = nullptr; -+ newThread.pCurrent = &newThread.root; -+ -+ mThreadMutex.lock(); -+ -+ // assign unique thread id for this thread -+ size_t id = mThreads.size(); -+ newThread.id = (UINT)id; -+ tlsThreadId = (UINT)id; -+ -+ // open threadviz file if enabled -+ if (mThreadViz) -+ { -+ char fileName[255]; -+ sprintf(fileName, "threadviz_thread.%d.dat", newThread.id); -+ newThread.vizFile = fopen(fileName, "wb"); -+ } -+ -+ // store new thread -+ mThreads.push_back(newThread); -+ -+ mThreadMutex.unlock(); -+} -+ -+UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) -+{ -+ size_t id = mBuckets.size(); -+ mBuckets.push_back(desc); -+ return (UINT)id; -+} -+ -+void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) -+{ -+ const char *arrows[] = { -+ "", -+ "|-> ", -+ " |-> ", -+ " |-> ", -+ " |-> ", -+ " |-> ", -+ " |-> " -+ }; -+ -+ // compute percent of total cycles used by this bucket -+ float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0); -+ -+ // compute percent of parent cycles used by this bucket -+ float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); -+ -+ // compute average cycle count per invocation -+ UINT64 CPE = bucket.elapsed / bucket.count; -+ -+ BUCKET_DESC &desc = mBuckets[bucket.id]; -+ -+ // construct hierarchy visualization -+ char hier[80]; -+ strcpy(hier, arrows[level]); -+ 
strcat(hier, desc.name.c_str()); -+ -+ // print out -+ fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", percentTotal, percentParent, bucket.elapsed, CPE, bucket.count, (unsigned long)0, (UINT32)(0), hier); -+ -+ // dump all children of this bucket -+ for (const BUCKET& child : bucket.children) -+ { -+ if (child.count) -+ { -+ PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child); -+ } -+ } -+} -+ -+void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) -+{ -+ // print header -+ fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str()); -+ fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); -+ -+ // compute thread level total cycle counts across all buckets from root -+ const BUCKET& root = thread.root; -+ UINT64 totalCycles = 0; -+ for (const BUCKET& child : root.children) -+ { -+ totalCycles += child.elapsed; -+ } -+ -+ for (const BUCKET& child : root.children) -+ { -+ if (child.count) -+ { -+ PrintBucket(f, 0, totalCycles, totalCycles, child); -+ } -+ } -+} -+ -+void BucketManager::DumpThreadViz() -+{ -+ // ensure all thread data is flushed -+ mThreadMutex.lock(); -+ for (auto& thread : mThreads) -+ { -+ fflush(thread.vizFile); -+ fclose(thread.vizFile); -+ } -+ mThreadMutex.unlock(); -+ -+ // dump bucket descriptions -+ FILE* f = fopen("threadviz_buckets.dat", "wb"); -+ for (auto& bucket : mBuckets) -+ { -+ Serialize(f, bucket); -+ } -+ fclose(f); -+} -+ -+void BucketManager::PrintReport(const std::string& filename) -+{ -+ if (mThreadViz) -+ { -+ DumpThreadViz(); -+ } -+ else -+ { -+ FILE* f = fopen(filename.c_str(), "w"); -+ -+ mThreadMutex.lock(); -+ for (const BUCKET_THREAD& thread : mThreads) -+ { -+ PrintThread(f, thread); -+ fprintf(f, "\n"); -+ } -+ mThreadMutex.unlock(); -+ -+ fclose(f); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h -new file mode 100644 -index 0000000..03530f5 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h -@@ -0,0 +1,195 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rdtsc_buckets.h -+* -+* @brief declaration for rdtsc buckets. 
-+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "os.h" -+#include -+#include -+ -+#include "rdtsc_buckets_shared.h" -+ -+// unique thread id stored in thread local storage -+extern THREAD UINT tlsThreadId; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief BucketManager encapsulates a single instance of the buckets -+/// functionality. There can be one or many bucket managers active -+/// at any time. The manager owns all the threads and -+/// bucket information that have been registered to it. -+class BucketManager -+{ -+public: -+ BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz) {} -+ -+ // removes all registered thread data -+ void ClearThreads() -+ { -+ mThreadMutex.lock(); -+ mThreads.clear(); -+ mThreadMutex.unlock(); -+ } -+ -+ // removes all registered buckets -+ void ClearBuckets() -+ { -+ mBuckets.clear(); -+ } -+ -+ /// Registers a new thread with the manager. -+ /// @param name - name of thread, used for labels in reports and threadviz -+ void RegisterThread(const std::string& name); -+ -+ /// Registers a new bucket type with the manager. Returns a unique -+ /// id which should be used in subsequent calls to start/stop the bucket -+ /// @param desc - description of the bucket -+ /// @return unique id -+ UINT RegisterBucket(const BUCKET_DESC& desc); -+ -+ // dump threadviz data -+ void DumpThreadViz(); -+ -+ // print report -+ void PrintReport(const std::string& filename); -+ -+ // start capturing -+ INLINE void StartCapture() -+ { -+ mCapturing = true; -+ } -+ -+ // stop capturing -+ INLINE void StopCapture() -+ { -+ mCapturing = false; -+ -+ // wait for all threads to pop back to root bucket -+ bool stillCapturing = true; -+ while (stillCapturing) -+ { -+ stillCapturing = false; -+ for (const BUCKET_THREAD& t : mThreads) -+ { -+ if (t.pCurrent != &t.root) -+ { -+ stillCapturing = true; -+ continue; -+ } -+ } -+ } -+ } -+ -+ // start a bucket -+ // @param id generated by RegisterBucket -+ INLINE void StartBucket(UINT id) -+ { -+ if (!mCapturing) return; -+ -+ SWR_ASSERT(tlsThreadId < mThreads.size()); -+ -+ BUCKET_THREAD& bt = mThreads[tlsThreadId]; -+ -+ // if threadviz is enabled, only need to dump start info to threads viz file -+ if (mThreadViz) -+ { -+ SWR_ASSERT(bt.vizFile != nullptr); -+ if (mBuckets[id].enableThreadViz) -+ { -+ VIZ_START_DATA data{ VIZ_START, id, __rdtsc() }; -+ Serialize(bt.vizFile, data); -+ } -+ } -+ else -+ { -+ if (bt.pCurrent->children.size() < mBuckets.size()) -+ { -+ bt.pCurrent->children.resize(mBuckets.size()); -+ } -+ BUCKET &child = bt.pCurrent->children[id]; -+ child.pParent = bt.pCurrent; -+ child.id = id; -+ child.start = __rdtsc(); -+ -+ // update thread's currently executing bucket -+ bt.pCurrent = &child; -+ } -+ -+ bt.level++; -+ } -+ -+ // stop the currently executing bucket -+ INLINE void StopBucket(UINT id) -+ { -+ SWR_ASSERT(tlsThreadId < mThreads.size()); -+ BUCKET_THREAD &bt = mThreads[tlsThreadId]; -+ -+ if (bt.level == 0) return; -+ -+ if (mThreadViz) -+ { -+ SWR_ASSERT(bt.vizFile != nullptr); -+ if (mBuckets[id].enableThreadViz) -+ { -+ VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() }; -+ Serialize(bt.vizFile, data); -+ } -+ } -+ else -+ { -+ if (bt.pCurrent->start == 0) return; -+ SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected"); -+ -+ bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start); -+ bt.pCurrent->count++; -+ -+ // pop to parent -+ bt.pCurrent = bt.pCurrent->pParent; 
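// Illustrative usage sketch (not part of the original patch): typical use of
// BucketManager from a single timed thread; bucket name and report file name
// below are illustrative only:
//
//     BucketManager mgr(false);                  // threadviz disabled
//     UINT frontend = mgr.RegisterBucket({ "FrontEnd", "", false, 0 });
//     mgr.RegisterThread("worker0");             // call on the thread being timed
//     mgr.StartCapture();
//     mgr.StartBucket(frontend);
//     /* ... work being measured ... */
//     mgr.StopBucket(frontend);
//     mgr.StopCapture();
//     mgr.PrintReport("rdtsc_report.txt");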
-+        }
-+
-+        bt.level--;
-+    }
-+
-+private:
-+    void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
-+    void PrintThread(FILE* f, const BUCKET_THREAD& thread);
-+
-+    // list of active threads that have registered with this manager
-+    std::vector<BUCKET_THREAD> mThreads;
-+
-+    // list of buckets registered with this manager
-+    std::vector<BUCKET_DESC> mBuckets;
-+
-+    // is capturing currently enabled
-+    volatile bool mCapturing{ false };
-+
-+    std::mutex mThreadMutex;
-+
-+    // enable threadviz
-+    bool mThreadViz{ false };
-+};
-diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
-new file mode 100644
-index 0000000..41c6d5d
---- /dev/null
-+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
-@@ -0,0 +1,167 @@
-+/****************************************************************************
-+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+*
-+* Permission is hereby granted, free of charge, to any person obtaining a
-+* copy of this software and associated documentation files (the "Software"),
-+* to deal in the Software without restriction, including without limitation
-+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+* and/or sell copies of the Software, and to permit persons to whom the
-+* Software is furnished to do so, subject to the following conditions:
-+*
-+* The above copyright notice and this permission notice (including the next
-+* paragraph) shall be included in all copies or substantial portions of the
-+* Software.
-+*
-+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+* IN THE SOFTWARE.
-+*
-+* @file rdtsc_buckets.h
-+*
-+* @brief declaration for rdtsc buckets.
-+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+ -+struct BUCKET -+{ -+ uint32_t id{ 0 }; -+ uint64_t start{ 0 }; -+ uint64_t elapsed{ 0 }; -+ uint32_t count{ 0 }; -+ -+ BUCKET* pParent{ nullptr }; -+ std::vector children; -+}; -+ -+struct BUCKET_DESC -+{ -+ // name of bucket, used in reports -+ std::string name; -+ -+ // description of bucket, used in threadviz -+ std::string description; -+ -+ // enable for threadviz dumping -+ bool enableThreadViz; -+ -+ // threadviz color of bucket, in RGBA8_UNORM format -+ uint32_t color; -+}; -+ -+struct BUCKET_THREAD -+{ -+ // name of thread, used in reports -+ std::string name; -+ -+ // id for this thread, assigned by the thread manager -+ uint32_t id; -+ -+ // root of the bucket hierarchy for this thread -+ BUCKET root; -+ -+ // currently executing bucket somewhere in the hierarchy -+ BUCKET* pCurrent; -+ -+ // currently executing hierarchy level -+ uint32_t level{ 0 }; -+ -+ // threadviz file object -+ FILE* vizFile{ nullptr }; -+ -+ BUCKET_THREAD() {} -+ BUCKET_THREAD(const BUCKET_THREAD& that) -+ { -+ name = that.name; -+ id = that.id; -+ root = that.root; -+ pCurrent = &root; -+ vizFile = that.vizFile; -+ } -+}; -+ -+enum VIZ_TYPE -+{ -+ VIZ_START = 0, -+ VIZ_STOP = 1, -+ VIZ_DATA = 2 -+}; -+ -+struct VIZ_START_DATA -+{ -+ uint8_t type; -+ uint32_t bucketId; -+ uint64_t timestamp; -+}; -+ -+struct VIZ_STOP_DATA -+{ -+ uint8_t type; -+ uint64_t timestamp; -+}; -+ -+inline void Serialize(FILE* f, const VIZ_START_DATA& data) -+{ -+ fwrite(&data, sizeof(VIZ_START_DATA), 1, f); -+} -+ -+inline void Deserialize(FILE* f, VIZ_START_DATA& data) -+{ -+ fread(&data, sizeof(VIZ_START_DATA), 1, f); -+ assert(data.type == VIZ_START); -+} -+ -+inline void Serialize(FILE* f, const VIZ_STOP_DATA& data) -+{ -+ fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f); -+} -+ -+inline void Deserialize(FILE* f, VIZ_STOP_DATA& data) -+{ -+ fread(&data, sizeof(VIZ_STOP_DATA), 1, f); -+ assert(data.type == VIZ_STOP); -+} -+ -+inline void Serialize(FILE* f, const std::string& string) -+{ -+ assert(string.size() <= 256); -+ -+ uint8_t length = (uint8_t)string.size(); -+ fwrite(&length, sizeof(length), 1, f); -+ fwrite(string.c_str(), string.size(), 1, f); -+} -+ -+inline void Deserialize(FILE* f, std::string& string) -+{ -+ char cstr[256]; -+ uint8_t length; -+ fread(&length, sizeof(length), 1, f); -+ fread(cstr, length, 1, f); -+ cstr[length] = 0; -+ string.assign(cstr); -+} -+ -+inline void Serialize(FILE* f, const BUCKET_DESC& desc) -+{ -+ Serialize(f, desc.name); -+ Serialize(f, desc.description); -+ fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); -+ fwrite(&desc.color, sizeof(desc.color), 1, f); -+} -+ -+inline void Deserialize(FILE* f, BUCKET_DESC& desc) -+{ -+ Deserialize(f, desc.name); -+ Deserialize(f, desc.description); -+ fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); -+ fread(&desc.color, sizeof(desc.color), 1, f); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h -new file mode 100644 -index 0000000..ef7804f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h -@@ -0,0 +1,792 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#ifndef __SWR_SIMDINTRIN_H__ -+#define __SWR_SIMDINTRIN_H__ -+ -+#include "os.h" -+ -+#include -+ -+#include -+#include -+#include -+ -+#if KNOB_SIMD_WIDTH == 8 -+typedef __m256 simdscalar; -+typedef __m256i simdscalari; -+typedef uint8_t simdmask; -+#else -+#error Unsupported vector width -+#endif -+ -+// simd vector -+OSALIGNSIMD(union) simdvector -+{ -+ simdscalar v[4]; -+ struct -+ { -+ simdscalar x, y, z, w; -+ }; -+ -+ simdscalar& operator[] (const int i) { return v[i]; } -+ const simdscalar& operator[] (const int i) const { return v[i]; } -+}; -+ -+#if KNOB_SIMD_WIDTH == 8 -+#define _simd128_maskstore_ps _mm_maskstore_ps -+#define _simd_load_ps _mm256_load_ps -+#define _simd_load1_ps _mm256_broadcast_ss -+#define _simd_loadu_ps _mm256_loadu_ps -+#define _simd_setzero_ps _mm256_setzero_ps -+#define _simd_set1_ps _mm256_set1_ps -+#define _simd_blend_ps _mm256_blend_ps -+#define _simd_blendv_ps _mm256_blendv_ps -+#define _simd_store_ps _mm256_store_ps -+#define _simd_mul_ps _mm256_mul_ps -+#define _simd_add_ps _mm256_add_ps -+#define _simd_sub_ps _mm256_sub_ps -+#define _simd_rsqrt_ps _mm256_rsqrt_ps -+#define _simd_min_ps _mm256_min_ps -+#define _simd_max_ps _mm256_max_ps -+#define _simd_movemask_ps _mm256_movemask_ps -+#define _simd_cvtps_epi32 _mm256_cvtps_epi32 -+#define _simd_cvttps_epi32 _mm256_cvttps_epi32 -+#define _simd_cvtepi32_ps _mm256_cvtepi32_ps -+#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) -+#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) -+#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ) -+#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) -+#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) -+#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) -+#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm) -+#define _simd_and_ps _mm256_and_ps -+#define _simd_or_ps _mm256_or_ps -+ -+#define _simd_rcp_ps _mm256_rcp_ps -+#define _simd_div_ps _mm256_div_ps -+#define _simd_castsi_ps _mm256_castsi256_ps -+#define _simd_andnot_ps _mm256_andnot_ps -+#define _simd_round_ps _mm256_round_ps -+#define _simd_castpd_ps _mm256_castpd_ps -+#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a)) -+ -+#define _simd_load_sd _mm256_load_sd -+#define _simd_movemask_pd _mm256_movemask_pd -+#define _simd_castsi_pd 
_mm256_castsi256_pd -+ -+// emulated integer simd -+#define SIMD_EMU_EPI(func, intrin) \ -+INLINE \ -+__m256i func(__m256i a, __m256i b)\ -+{\ -+ __m128i aHi = _mm256_extractf128_si256(a, 1);\ -+ __m128i bHi = _mm256_extractf128_si256(b, 1);\ -+ __m128i aLo = _mm256_castsi256_si128(a);\ -+ __m128i bLo = _mm256_castsi256_si128(b);\ -+\ -+ __m128i subLo = intrin(aLo, bLo);\ -+ __m128i subHi = intrin(aHi, bHi);\ -+\ -+ __m256i result = _mm256_castsi128_si256(subLo);\ -+ result = _mm256_insertf128_si256(result, subHi, 1);\ -+\ -+ return result;\ -+} -+ -+#if (KNOB_ARCH == KNOB_ARCH_AVX) -+#define _simd_mul_epi32 _simdemu_mul_epi32 -+#define _simd_mullo_epi32 _simdemu_mullo_epi32 -+#define _simd_sub_epi32 _simdemu_sub_epi32 -+#define _simd_sub_epi64 _simdemu_sub_epi64 -+#define _simd_min_epi32 _simdemu_min_epi32 -+#define _simd_min_epu32 _simdemu_min_epu32 -+#define _simd_max_epi32 _simdemu_max_epi32 -+#define _simd_max_epu32 _simdemu_max_epu32 -+#define _simd_add_epi32 _simdemu_add_epi32 -+#define _simd_and_si _simdemu_and_si -+#define _simd_andnot_si _simdemu_andnot_si -+#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32 -+#define _simd_cmplt_epi32 _simdemu_cmplt_epi32 -+#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32 -+#define _simd_or_si _simdemu_or_si -+#define _simd_castps_si _mm256_castps_si256 -+#define _simd_adds_epu8 _simdemu_adds_epu8 -+#define _simd_subs_epu8 _simdemu_subs_epu8 -+#define _simd_add_epi8 _simdemu_add_epi8 -+#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 -+#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 -+ -+SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) -+SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) -+SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32) -+SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64) -+SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32) -+SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32) -+SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32) -+SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32) -+SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32) -+SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128) -+SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128) -+SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32) -+SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32) -+SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32) -+SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128) -+SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8) -+SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) -+SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) -+SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) -+SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) -+ -+#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) -+#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) -+ -+#define _simd_srli_si(a,i) _simdemu_srli_si128(a) -+#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a) -+#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a) -+#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a) -+#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128(_mm256_castps_si256(a))) -+ -+#define _simd128_fmadd_ps _mm_fmaddemu_ps -+#define _simd_fmadd_ps _mm_fmaddemu256_ps -+#define _simd_fmsub_ps _mm_fmsubemu256_ps -+#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 -+SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) -+ -+INLINE -+__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) -+{ -+ __m128 res = _mm_mul_ps(a, b); -+ res = _mm_add_ps(res, c); -+ return res; -+} -+ -+INLINE -+__m256 
_mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) -+{ -+ __m256 res = _mm256_mul_ps(a, b); -+ res = _mm256_add_ps(res, c); -+ return res; -+} -+ -+INLINE -+__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) -+{ -+ __m256 res = _mm256_mul_ps(a, b); -+ res = _mm256_sub_ps(res, c); -+ return res; -+} -+ -+INLINE -+__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) -+{ -+ uint32_t *pOffsets = (uint32_t*)&vOffsets; -+ simdscalar vResult; -+ float* pResult = (float*)&vResult; -+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) -+ { -+ uint32_t offset = pOffsets[i]; -+ offset = offset * scale; -+ pResult[i] = *(float*)(((const uint8_t*)pBase + offset)); -+ } -+ -+ return vResult; -+} -+ -+INLINE -+__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) -+{ -+ uint32_t *pOffsets = (uint32_t*)&vOffsets; -+ simdscalar vResult = vSrc; -+ float* pResult = (float*)&vResult; -+ DWORD index; -+ uint32_t mask = _simd_movemask_ps(vMask); -+ while (_BitScanForward(&index, mask)) -+ { -+ mask &= ~(1 << index); -+ uint32_t offset = pOffsets[index]; -+ offset = offset * scale; -+ pResult[index] = *(float*)(((const uint8_t*)pBase + offset)); -+ } -+ -+ return vResult; -+} -+ -+INLINE -+__m256i _simd_abs_epi32(__m256i a) -+{ -+ __m128i aHi = _mm256_extractf128_si256(a, 1); -+ __m128i aLo = _mm256_castsi256_si128(a); -+ __m128i absLo = _mm_abs_epi32(aLo); -+ __m128i absHi = _mm_abs_epi32(aHi); -+ __m256i result = _mm256_castsi128_si256(absLo); -+ result = _mm256_insertf128_si256(result, absHi, 1); -+ return result; -+} -+#else -+ -+#define _simd_mul_epi32 _mm256_mul_epi32 -+#define _simd_mullo_epi32 _mm256_mullo_epi32 -+#define _simd_sub_epi32 _mm256_sub_epi32 -+#define _simd_sub_epi64 _mm256_sub_epi64 -+#define _simd_min_epi32 _mm256_min_epi32 -+#define _simd_max_epi32 _mm256_max_epi32 -+#define _simd_min_epu32 _mm256_min_epu32 -+#define _simd_max_epu32 _mm256_max_epu32 -+#define _simd_add_epi32 _mm256_add_epi32 -+#define _simd_and_si _mm256_and_si256 -+#define _simd_andnot_si _mm256_andnot_si256 -+#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32 -+#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a) -+#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b) -+#define _simd_or_si _mm256_or_si256 -+#define _simd_castps_si _mm256_castps_si256 -+ -+#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 -+#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 -+ -+#define _simd_srli_si(a,i) _simdemu_srli_si128(a) -+#define _simd_slli_epi32 _mm256_slli_epi32 -+#define _simd_srai_epi32 _mm256_srai_epi32 -+#define _simd_srli_epi32 _mm256_srli_epi32 -+#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128(_mm256_castps_si256(a))) -+#define _simd128_fmadd_ps _mm_fmadd_ps -+#define _simd_fmadd_ps _mm256_fmadd_ps -+#define _simd_fmsub_ps _mm256_fmsub_ps -+#define _simd_shuffle_epi8 _mm256_shuffle_epi8 -+#define _simd_adds_epu8 _mm256_adds_epu8 -+#define _simd_subs_epu8 _mm256_subs_epu8 -+#define _simd_add_epi8 _mm256_add_epi8 -+#define _simd_i32gather_ps _mm256_i32gather_ps -+#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps -+#define _simd_abs_epi32 _mm256_abs_epi32 -+ -+#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 -+#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 -+#endif -+ -+#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) -+#define _simd_shuffle_ps _mm256_shuffle_ps -+#define _simd_set1_epi32 _mm256_set1_epi32 -+#define _simd_set1_epi8 
_mm256_set1_epi8
-+#define _simd_setzero_si _mm256_setzero_si256
-+#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-+#define _simd_store_si _mm256_store_si256
-+#define _simd_broadcast_ss _mm256_broadcast_ss
-+#define _simd_maskstore_ps _mm256_maskstore_ps
-+#define _simd_load_si _mm256_load_si256
-+#define _simd_loadu_si _mm256_loadu_si256
-+#define _simd_sub_ps _mm256_sub_ps
-+#define _simd_testz_ps _mm256_testz_ps
-+#define _simd_xor_ps _mm256_xor_ps
-+
-+
-+INLINE
-+simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
-+{
-+    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
-+}
-+
-+// convert bitmask to vector mask
-+INLINE
-+simdscalar vMask(int32_t mask)
-+{
-+    __m256i vec = _mm256_set1_epi32(mask);
-+    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-+    vec = _simd_and_si(vec, bit);
-+    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-+    return _simd_castsi_ps(vec);
-+}
-+
-+INLINE
-+void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
-+{
-+    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
-+    _mm256_store_ps(rArray, r);
-+    _mm256_store_ps(sArray, s);
-+    rArray[rlane] = sArray[slane];
-+    r = _mm256_load_ps(rArray);
-+}
-+
-+template<int i>
-+__m256i _simdemu_srli_si128(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_srli_si128(aHi, i);
-+    __m128i resLo = _mm_alignr_epi8(aHi, aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+template<int i>
-+__m256i _simdemu_slli_epi32(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_slli_epi32(aHi, i);
-+    __m128i resLo = _mm_slli_epi32(aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+template<int i>
-+__m256i _simdemu_srai_epi32(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_srai_epi32(aHi, i);
-+    __m128i resLo = _mm_srai_epi32(aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+template<int i>
-+__m256i _simdemu_srli_epi32(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_srli_epi32(aHi, i);
-+    __m128i resLo = _mm_srli_epi32(aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+INLINE
-+void _simdvec_transpose(simdvector &v)
-+{
-+    SWR_ASSERT(false, "Need to implement 8 wide version");
-+}
-+
-+#else
-+#error Unsupported vector width
-+#endif
-+
-+// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
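// Illustrative example (not part of the original patch): with KNOB_SIMD_WIDTH == 8
// and p = {1, 2, 3, 4}, the function below yields
//     r[0] = {1,1,1,1,1,1,1,1}   // x broadcast to every lane
//     r[1] = {2,2,2,2,2,2,2,2}   // y
//     r[2] = {3,3,3,3,3,3,3,3}   // z
//     r[3] = {4,4,4,4,4,4,4,4}   // w
// i.e. one AoS vec4 becomes four SoA registers, one register per component.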
-+INLINE -+void _simdvec_load_ps(simdvector& r, const float *p) -+{ -+ r[0] = _simd_set1_ps(p[0]); -+ r[1] = _simd_set1_ps(p[1]); -+ r[2] = _simd_set1_ps(p[2]); -+ r[3] = _simd_set1_ps(p[3]); -+} -+ -+INLINE -+void _simdvec_mov(simdvector& r, const simdscalar& s) -+{ -+ r[0] = s; -+ r[1] = s; -+ r[2] = s; -+ r[3] = s; -+} -+ -+INLINE -+void _simdvec_mov(simdvector& r, const simdvector& v) -+{ -+ r[0] = v[0]; -+ r[1] = v[1]; -+ r[2] = v[2]; -+ r[3] = v[3]; -+} -+ -+// just move a lane from the source simdvector to dest simdvector -+INLINE -+void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) -+{ -+ _simd_mov(r[0], rlane, s[0], slane); -+ _simd_mov(r[1], rlane, s[1], slane); -+ _simd_mov(r[2], rlane, s[2], slane); -+ _simd_mov(r[3], rlane, s[3], slane); -+} -+ -+INLINE -+void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) -+{ -+ simdscalar tmp; -+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) -+ -+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) -+ -+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) -+} -+ -+INLINE -+void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) -+{ -+ simdscalar tmp; -+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) -+ -+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) -+ -+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) -+ -+ tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) -+} -+ -+INLINE -+simdscalar _simdvec_rcp_length_ps(const simdvector& v) -+{ -+ simdscalar length; -+ _simdvec_dp4_ps(length, v, v); -+ return _simd_rsqrt_ps(length); -+} -+ -+INLINE -+void _simdvec_normalize_ps(simdvector& r, const simdvector& v) -+{ -+ simdscalar vecLength; -+ vecLength = _simdvec_rcp_length_ps(v); -+ -+ r[0] = _simd_mul_ps(v[0], vecLength); -+ r[1] = _simd_mul_ps(v[1], vecLength); -+ r[2] = _simd_mul_ps(v[2], vecLength); -+ r[3] = _simd_mul_ps(v[3], vecLength); -+} -+ -+INLINE -+void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s) -+{ -+ r[0] = _simd_mul_ps(v[0], s); -+ r[1] = _simd_mul_ps(v[1], s); -+ r[2] = _simd_mul_ps(v[2], s); -+ r[3] = _simd_mul_ps(v[3], s); -+} -+ -+INLINE -+void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1) -+{ -+ r[0] = _simd_mul_ps(v0[0], v1[0]); -+ r[1] = _simd_mul_ps(v0[1], v1[1]); -+ r[2] = _simd_mul_ps(v0[2], v1[2]); -+ r[3] = _simd_mul_ps(v0[3], v1[3]); -+} -+ -+INLINE -+void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1) -+{ -+ r[0] = _simd_add_ps(v0[0], v1[0]); -+ r[1] = _simd_add_ps(v0[1], v1[1]); -+ r[2] = _simd_add_ps(v0[2], v1[2]); -+ r[3] = _simd_add_ps(v0[3], v1[3]); -+} -+ -+INLINE -+void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s) -+{ -+ r[0] = _simd_min_ps(v0[0], s); -+ r[1] = _simd_min_ps(v0[1], s); -+ r[2] = _simd_min_ps(v0[2], s); -+ r[3] = _simd_min_ps(v0[3], s); -+} -+ -+INLINE -+void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) -+{ -+ r[0] = _simd_max_ps(v0[0], s); -+ r[1] = _simd_max_ps(v0[1], s); -+ r[2] = _simd_max_ps(v0[2], s); -+ r[3] = _simd_max_ps(v0[3], s); -+} -+ -+// Matrix4x4 * Vector4 -+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w) -+// 
outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w) -+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w) -+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) -+INLINE -+void _simd_mat4x4_vec4_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[0] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[2] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[3] = r0; -+} -+ -+// Matrix4x4 * Vector3 - Direction Vector where w = 0. 
-+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0) -+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0) -+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0) -+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) -+INLINE -+void _simd_mat3x3_vec3_w0_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ result[0] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ result[2] = r0; -+ -+ result[3] = _simd_setzero_ps(); -+} -+ -+// Matrix4x4 * Vector3 - Position vector where w = 1. 
-+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1) -+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1) -+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1) -+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) -+INLINE -+void _simd_mat4x4_vec3_w1_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[0] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[2] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] -+ result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+} -+ -+INLINE -+void _simd_mat4x3_vec3_w1_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[0] = r0; -+ -+ m = 
_simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[2] = r0; -+ result[3] = _simd_set1_ps(1.0f); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Compute plane equation vA * vX + vB * vY + vC -+INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY) -+{ -+ simdscalar vOut = _simd_fmadd_ps(vA, vX, vC); -+ vOut = _simd_fmadd_ps(vB, vY, vOut); -+ return vOut; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Interpolates a single component. -+/// @param vI - barycentric I -+/// @param vJ - barycentric J -+/// @param pInterpBuffer - pointer to attribute barycentric coeffs -+template -+static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer) -+{ -+ const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp]; -+ const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp]; -+ const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp]; -+ -+ simdscalar vA = _simd_broadcast_ss(pInterpA); -+ simdscalar vB = _simd_broadcast_ss(pInterpB); -+ simdscalar vC = _simd_broadcast_ss(pInterpC); -+ -+ simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ); -+ vC = _simd_mul_ps(vk, vC); -+ -+ return vplaneps(vA, vB, vC, vI, vJ); -+} -+ -+ -+#endif//__SWR_SIMDINTRIN_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp -new file mode 100644 -index 0000000..8f176e1 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#include "common/os.h" -+#include -+#include -+#include -+ -+#if defined(SWR_ENABLE_ASSERTS) -+ -+#if defined(_WIN32) -+#pragma comment(lib, "user32.lib") -+#endif // _WIN32 -+ -+bool SwrAssert( -+ bool& enabled, -+ const char* pExpression, -+ const char* pFileName, -+ uint32_t lineNum, -+ const char* pFmtString /* = nullptr */, -+ ...) -+{ -+ if (!enabled) return false; -+ -+#if defined(_WIN32) -+ static const int MAX_MESSAGE_LEN = 2048; -+ char msgBuf[MAX_MESSAGE_LEN]; -+ -+ sprintf_s(msgBuf, "%s(%d): assert: %s\n", pFileName, lineNum, pExpression); -+ msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; -+ msgBuf[MAX_MESSAGE_LEN - 1] = 0; -+ OutputDebugStringA(msgBuf); -+ -+ int offset = 0; -+ -+ if (pFmtString) -+ { -+ va_list args; -+ va_start(args, pFmtString); -+ offset = _vsnprintf_s( -+ msgBuf, -+ sizeof(msgBuf), -+ sizeof(msgBuf), -+ pFmtString, -+ args); -+ va_end(args); -+ -+ if (offset < 0) { return true; } -+ -+ OutputDebugStringA("\t"); -+ OutputDebugStringA(msgBuf); -+ OutputDebugStringA("\n"); -+ } -+ -+ if (KNOB_ENABLE_ASSERT_DIALOGS) -+ { -+ int retval = sprintf_s( -+ &msgBuf[offset], -+ MAX_MESSAGE_LEN - offset, -+ "\n\n" -+ "File: %s\n" -+ "Line: %d\n" -+ "\n" -+ "Expression: %s\n\n" -+ "Cancel: Disable this assert for the remainder of the process\n" -+ "Try Again: Break into the debugger\n" -+ "Continue: Continue execution (but leave assert enabled)", -+ pFileName, -+ lineNum, -+ pExpression); -+ -+ if (retval < 0) { return true; } -+ -+ offset += retval; -+ -+ if (!IsDebuggerPresent()) -+ { -+ sprintf_s( -+ &msgBuf[offset], -+ MAX_MESSAGE_LEN - offset, -+ "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!"); -+ } -+ -+ retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION); -+ -+ switch (retval) -+ { -+ case IDCANCEL: -+ enabled = false; -+ return false; -+ -+ case IDTRYAGAIN: -+ return true; -+ -+ case IDCONTINUE: -+ return false; -+ } -+ } -+ else -+ { -+ return 0 != IsDebuggerPresent(); -+ } -+ -+#else // !_WIN32 -+ fprintf(stderr, "%s(%d): assert: %s\n", pFileName, lineNum, pExpression); -+ if (pFmtString) -+ { -+ va_list args; -+ va_start(args, pFmtString); -+ vfprintf(stderr, pFmtString, args); -+ va_end(args); -+ } -+ fflush(stderr); -+ -+ /// @todo - Implement message box on non-Windows platforms -+ -+#endif -+ return true; -+} -+ -+#endif // SWR_ENABLE_ASSERTS -diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h -new file mode 100644 -index 0000000..afc9f59 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h -@@ -0,0 +1,84 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#ifndef __SWR_ASSERT_H__ -+#define __SWR_ASSERT_H__ -+ -+#if !defined(__SWR_OS_H__) -+#error swr_assert.h should not be included directly, please include "common/os.h" instead. -+#endif -+ -+#if !defined(SWR_ENABLE_ASSERTS) -+ -+#if !defined(NDEBUG) -+#define SWR_ENABLE_ASSERTS 1 -+#else -+#define SWR_ENABLE_ASSERTS 0 -+#endif // _DEBUG -+ -+#endif // SWR_ENABLE_ASSERTS -+ -+#if SWR_ENABLE_ASSERTS -+#include "assert.h" -+ -+#if !defined(__cplusplus) -+ -+#pragma message("C++ is required for SWR Asserts, falling back to assert.h") -+ -+#define SWR_ASSERT(e, ...) assert(e) -+ -+#else -+ -+#if defined(assert) -+#undef assert -+#endif -+#define assert(exp) SWR_ASSERT(exp) -+ -+bool SwrAssert( -+ bool& enabled, -+ const char* pExpression, -+ const char* pFileName, -+ uint32_t lineNum, -+ const char* pFmtString = nullptr, -+ ...); -+ -+#define SWR_ASSERT(e, ...) {\ -+ bool expFailed = !(e);\ -+ if (expFailed) {\ -+ static bool swrAssertEnabled = true;\ -+ expFailed = SwrAssert(swrAssertEnabled, #e, __FILE__, __LINE__, ##__VA_ARGS__);\ -+ if (expFailed) { DEBUGBREAK; }\ -+ }\ -+} -+ -+#endif // C++ -+ -+#else // No asserts enabled -+ -+#define SWR_ASSERT(e, ...) {} -+ -+#endif -+ -+#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__) -+ -+#endif//__SWR_OS_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp -new file mode 100644 -index 0000000..1081e28 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp -@@ -0,0 +1,1461 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
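// [Editor's note] Hedged usage sketch for the SWR_ASSERT macro defined in
// swr_assert.h above (assumes "common/os.h" is included so the macro is visible).
// On failure SwrAssert() formats and reports the message; a 'true' return
// triggers DEBUGBREAK. Illustration only, not part of the original patch.
static void BindSoBufferChecked(uint32_t slot)
{
    // Plain condition, no message.
    SWR_ASSERT(slot < 4);

    // Condition plus printf-style diagnostic, mirroring the check in
    // SwrSetSoBuffers() in api.cpp below.
    SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
}

// Unimplemented paths can use the convenience macro:
static void SomeUnfinishedPath() { SWR_NOT_IMPL; }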
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file api.cpp -+* -+* @brief API implementation -+* -+******************************************************************************/ -+ -+#include -+#include -+#include -+ -+#if defined(__gnu_linux__) || defined(__linux__) -+#include -+#endif -+ -+#include "core/api.h" -+#include "core/backend.h" -+#include "core/context.h" -+#include "core/frontend.h" -+#include "core/rasterizer.h" -+#include "core/rdtsc_core.h" -+#include "core/threads.h" -+#include "core/tilemgr.h" -+#include "core/clip.h" -+ -+#include "common/simdintrin.h" -+#include "common/os.h" -+ -+void SetupDefaultState(SWR_CONTEXT *pContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create SWR Context. -+/// @param pCreateInfo - pointer to creation info. -+HANDLE SwrCreateContext( -+ const SWR_CREATECONTEXT_INFO* pCreateInfo) -+{ -+ RDTSC_RESET(); -+ RDTSC_INIT(0); -+ -+ void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); -+ memset(pContextMem, 0, sizeof(SWR_CONTEXT)); -+ SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); -+ -+ pContext->driverType = pCreateInfo->driver; -+ pContext->privateStateSize = pCreateInfo->privateStateSize; -+ -+ pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); -+ memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); -+ -+ pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); -+ memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); -+ -+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) -+ { -+ pContext->dcRing[dc].arena.Init(); -+ pContext->dcRing[dc].inUse = false; -+ pContext->dcRing[dc].pTileMgr = new MacroTileMgr(); -+ pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. -+ -+ pContext->dsRing[dc].arena.Init(); -+ } -+ -+ if (!KNOB_SINGLE_THREADED) -+ { -+ memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); -+ memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); -+ new (&pContext->WaitLock) std::mutex(); -+ new (&pContext->FifosNotEmpty) std::condition_variable(); -+ -+ CreateThreadPool(pContext, &pContext->threadPool); -+ } -+ -+ // Calling createThreadPool() above can set SINGLE_THREADED -+ if (KNOB_SINGLE_THREADED) -+ { -+ pContext->NumWorkerThreads = 1; -+ } -+ -+ // Allocate scratch space for workers. -+ ///@note We could lazily allocate this but its rather small amount of memory. -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ ///@todo Use numa API for allocations using numa information from thread data (if exists). 
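// [Editor's note] Self-contained sketch of the ring indexing behind the
// dcRing/dsRing allocations above: draw ids grow monotonically and map onto a
// fixed ring of KNOB_MAX_DRAWS_IN_FLIGHT slots, each reused only after the draw
// occupying it retires (see UpdateLastRetiredId/GetDrawContext below). The ring
// size here is a stand-in value; illustration only, not part of the original patch.
#include <cstdint>
#include <cstdio>

static const uint64_t MAX_DRAWS_IN_FLIGHT = 96;   // stand-in for KNOB_MAX_DRAWS_IN_FLIGHT

int main()
{
    uint64_t tail = 200;   // DrawEnqueued
    uint64_t head = 100;   // LastRetiredId + 1, which may lag far behind

    // UpdateLastRetiredId() clamps head so it never scans ring entries that
    // newer draws have already overwritten.
    if ((tail - head) > MAX_DRAWS_IN_FLIGHT - 1)
    {
        head = tail - MAX_DRAWS_IN_FLIGHT + 1;     // 105 for these values
    }

    // Each draw id owns slot (drawId % ring size).
    printf("head=%llu slot=%llu\n",
           (unsigned long long)head,
           (unsigned long long)(head % MAX_DRAWS_IN_FLIGHT));
    return 0;
}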
-+ pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); -+ } -+ -+ pContext->LastRetiredId = 0; -+ pContext->nextDrawId = 1; -+ -+ // workers start at draw 1 -+ for (uint32_t i = 0; i < KNOB_MAX_NUM_THREADS; ++i) -+ { -+ pContext->WorkerFE[i] = 1; -+ pContext->WorkerBE[i] = 1; -+ } -+ -+ pContext->DrawEnqueued = 1; -+ -+ // State setup AFTER context is fully initialized -+ SetupDefaultState(pContext); -+ -+ // initialize hot tile manager -+ pContext->pHotTileMgr = new HotTileMgr(); -+ -+ // initialize function pointer tables -+ InitClearTilesTable(); -+ -+ // initialize store tiles function -+ pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; -+ pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; -+ pContext->pfnClearTile = pCreateInfo->pfnClearTile; -+ -+ return (HANDLE)pContext; -+} -+ -+void SwrDestroyContext(HANDLE hContext) -+{ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DestroyThreadPool(pContext, &pContext->threadPool); -+ -+ // free the fifos -+ for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) -+ { -+ delete(pContext->dcRing[i].pTileMgr); -+ delete(pContext->dcRing[i].pDispatch); -+ } -+ -+ // Free scratch space. -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ _aligned_free(pContext->pScratch[i]); -+ } -+ -+ _aligned_free(pContext->dcRing); -+ _aligned_free(pContext->dsRing); -+ -+ delete(pContext->pHotTileMgr); -+ -+ pContext->~SWR_CONTEXT(); -+ _aligned_free((SWR_CONTEXT*)hContext); -+} -+ -+void WakeAllThreads(SWR_CONTEXT *pContext) -+{ -+ std::unique_lock lock(pContext->WaitLock); -+ pContext->FifosNotEmpty.notify_all(); -+ lock.unlock(); -+} -+ -+bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) -+{ -+ // For single thread nothing should still be drawing. -+ if (KNOB_SINGLE_THREADED) { return false; } -+ -+ if (pDC->isCompute) -+ { -+ if (pDC->doneCompute) -+ { -+ pDC->inUse = false; -+ return false; -+ } -+ } -+ -+ // Check if backend work is done. First make sure all triangles have been binned. -+ if (pDC->doneFE == true) -+ { -+ // ensure workers have all moved passed this draw -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ if (pContext->WorkerFE[i] <= pDC->drawId) -+ { -+ return true; -+ } -+ -+ if (pContext->WorkerBE[i] <= pDC->drawId) -+ { -+ return true; -+ } -+ } -+ -+ pDC->inUse = false; // all work is done. -+ } -+ -+ return pDC->inUse; -+} -+ -+void UpdateLastRetiredId(SWR_CONTEXT *pContext) -+{ -+ uint64_t head = pContext->LastRetiredId + 1; -+ uint64_t tail = pContext->DrawEnqueued; -+ -+ // There's no guarantee the DRAW_CONTEXT associated with (LastRetiredId+1) is still valid. -+ // This is because the update to LastRetiredId can fall behind causing the range from LastRetiredId -+ // to DrawEnqueued to exceed the size of the DRAW_CONTEXT ring. 
Check for this and manually increment -+ // the head to the oldest entry of the DRAW_CONTEXT ring -+ if ((tail - head) > KNOB_MAX_DRAWS_IN_FLIGHT - 1) -+ { -+ head = tail - KNOB_MAX_DRAWS_IN_FLIGHT + 1; -+ } -+ -+ DRAW_CONTEXT *pDC = &pContext->dcRing[head % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ while ((head < tail) && !StillDrawing(pContext, pDC)) -+ { -+ pContext->LastRetiredId = pDC->drawId; -+ head++; -+ pDC = &pContext->dcRing[head % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ } -+} -+ -+void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId) -+{ -+ if (!KNOB_SINGLE_THREADED) -+ { -+ while (drawId > pContext->LastRetiredId) -+ { -+ WakeAllThreads(pContext); -+ UpdateLastRetiredId(pContext); -+ } -+ } -+} -+ -+void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) -+{ -+ memcpy(&dst.state, &src.state, sizeof(API_STATE)); -+} -+ -+void QueueDraw(SWR_CONTEXT *pContext) -+{ -+ _ReadWriteBarrier(); -+ pContext->DrawEnqueued ++; -+ -+ if (KNOB_SINGLE_THREADED) -+ { -+ // flush denormals to 0 -+ uint32_t mxcsr = _mm_getcsr(); -+ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); -+ -+ std::unordered_set lockedTiles; -+ WorkOnFifoFE(pContext, 0, pContext->WorkerFE[0], 0); -+ WorkOnFifoBE(pContext, 0, pContext->WorkerBE[0], lockedTiles); -+ -+ // restore csr -+ _mm_setcsr(mxcsr); -+ } -+ else -+ { -+ RDTSC_START(APIDrawWakeAllThreads); -+ WakeAllThreads(pContext); -+ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); -+ } -+ -+ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. -+ pContext->pPrevDrawContext = pContext->pCurDrawContext; -+ pContext->pCurDrawContext = nullptr; -+} -+ -+///@todo Combine this with QueueDraw -+void QueueDispatch(SWR_CONTEXT *pContext) -+{ -+ _ReadWriteBarrier(); -+ pContext->DrawEnqueued++; -+ -+ if (KNOB_SINGLE_THREADED) -+ { -+ // flush denormals to 0 -+ uint32_t mxcsr = _mm_getcsr(); -+ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); -+ -+ WorkOnCompute(pContext, 0, pContext->WorkerBE[0]); -+ -+ // restore csr -+ _mm_setcsr(mxcsr); -+ } -+ else -+ { -+ RDTSC_START(APIDrawWakeAllThreads); -+ WakeAllThreads(pContext); -+ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); -+ } -+ -+ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. -+ pContext->pPrevDrawContext = pContext->pCurDrawContext; -+ pContext->pCurDrawContext = nullptr; -+} -+ -+DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) -+{ -+ RDTSC_START(APIGetDrawContext); -+ // If current draw context is null then need to obtain a new draw context to use from ring. -+ if (pContext->pCurDrawContext == nullptr) -+ { -+ uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; -+ -+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; -+ pContext->pCurDrawContext = pCurDrawContext; -+ -+ // Update LastRetiredId -+ UpdateLastRetiredId(pContext); -+ -+ // Need to wait until this draw context is available to use. -+ while (StillDrawing(pContext, pCurDrawContext)) -+ { -+ // Make sure workers are working. -+ WakeAllThreads(pContext); -+ -+ _mm_pause(); -+ } -+ -+ // Assign next available entry in DS ring to this DC. -+ uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; -+ pCurDrawContext->pState = &pContext->dsRing[dsIndex]; -+ -+ Arena& stateArena = pCurDrawContext->pState->arena; -+ -+ // Copy previous state to current state. 
-+ if (pContext->pPrevDrawContext) -+ { -+ DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; -+ -+ // If we're splitting our draw then we can just use the same state from the previous -+ // draw. In this case, we won't increment the DS ring index so the next non-split -+ // draw can receive the state. -+ if (isSplitDraw == false) -+ { -+ CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); -+ -+ stateArena.Reset(); // Reset memory. -+ -+ // Copy private state to new context. -+ if (pPrevDrawContext->pState->pPrivateState != nullptr) -+ { -+ pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); -+ memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize); -+ } -+ -+ pContext->curStateId++; // Progress state ring index forward. -+ } -+ else -+ { -+ // If its a split draw then just copy the state pointer over -+ // since its the same draw. -+ pCurDrawContext->pState = pPrevDrawContext->pState; -+ } -+ } -+ else -+ { -+ stateArena.Reset(); // Reset memory. -+ pContext->curStateId++; // Progress state ring index forward. -+ } -+ -+ pCurDrawContext->dependency = 0; -+ pCurDrawContext->arena.Reset(); -+ pCurDrawContext->pContext = pContext; -+ pCurDrawContext->isCompute = false; // Dispatch has to set this to true. -+ pCurDrawContext->inUse = false; -+ -+ pCurDrawContext->doneCompute = false; -+ pCurDrawContext->doneFE = false; -+ pCurDrawContext->FeLock = 0; -+ -+ pCurDrawContext->pTileMgr->initialize(); -+ -+ // Assign unique drawId for this DC -+ pCurDrawContext->drawId = pContext->nextDrawId++; -+ } -+ else -+ { -+ SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); -+ } -+ -+ RDTSC_STOP(APIGetDrawContext, 0, 0); -+ return pContext->pCurDrawContext; -+} -+ -+API_STATE* GetDrawState(SWR_CONTEXT *pContext) -+{ -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ SWR_ASSERT(pDC->pState != nullptr); -+ -+ return &pDC->pState->state; -+} -+ -+void SetupDefaultState(SWR_CONTEXT *pContext) -+{ -+ API_STATE* pState = GetDrawState(pContext); -+ -+ pState->rastState.cullMode = SWR_CULLMODE_NONE; -+ pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; -+} -+ -+static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) -+{ -+ return (SWR_CONTEXT*)hContext; -+} -+ -+void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2) -+{ -+ RDTSC_START(APISync); -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->inUse = true; -+ -+ pDC->FeWork.type = SYNC; -+ pDC->FeWork.pfnWork = ProcessSync; -+ pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc; -+ pDC->FeWork.desc.sync.userData = userData; -+ pDC->FeWork.desc.sync.userData2 = userData2; -+ -+ // cannot execute until all previous draws have completed -+ pDC->dependency = pDC->drawId - 1; -+ -+ //enqueue -+ QueueDraw(pContext); -+ -+ RDTSC_STOP(APISync, 1, 0); -+} -+ -+void SwrWaitForIdle(HANDLE hContext) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ -+ // Wait on the previous DrawContext's drawId, as this function doesn't queue anything. 
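// [Editor's note] Hedged sketch of using SwrSync() above as a fence: the queued
// SYNC work item depends on drawId - 1, so the callback fires only after all
// previously queued draws have retired. HANDLE and PFN_CALLBACK_FUNC come from
// api.h below; calling-convention details are omitted. Illustration only, not
// part of the original patch.
#include <atomic>
#include <cstdint>

struct MyFence
{
    std::atomic<uint64_t> completedValue{0};
};

// Shape matches PFN_CALLBACK_FUNC: void (uint64_t data, uint64_t data2).
static void FenceCallback(uint64_t userData, uint64_t userData2)
{
    MyFence* pFence = reinterpret_cast<MyFence*>(userData);
    pFence->completedValue.store(userData2, std::memory_order_release);
}

static void SubmitFence(HANDLE hContext, MyFence& fence, uint64_t value)
{
    // FenceCallback(&fence, value) runs once all prior rendering completes.
    SwrSync(hContext, FenceCallback, reinterpret_cast<uint64_t>(&fence), value);
}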
-+ if (pContext->pPrevDrawContext) -+ WaitForDependencies(pContext, pContext->pPrevDrawContext->drawId); -+} -+ -+void SwrSetVertexBuffers( -+ HANDLE hContext, -+ uint32_t numBuffers, -+ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ for (uint32_t i = 0; i < numBuffers; ++i) -+ { -+ const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; -+ pState->vertexBuffers[pVB->index] = *pVB; -+ } -+} -+ -+void SwrSetIndexBuffer( -+ HANDLE hContext, -+ const SWR_INDEX_BUFFER_STATE* pIndexBuffer) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->indexBuffer = *pIndexBuffer; -+} -+ -+void SwrSetFetchFunc( -+ HANDLE hContext, -+ PFN_FETCH_FUNC pfnFetchFunc) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->pfnFetchFunc = pfnFetchFunc; -+} -+ -+void SwrSetSoFunc( -+ HANDLE hContext, -+ PFN_SO_FUNC pfnSoFunc, -+ uint32_t streamIndex) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ SWR_ASSERT(streamIndex < MAX_SO_STREAMS); -+ -+ pState->pfnSoFunc[streamIndex] = pfnSoFunc; -+} -+ -+void SwrSetSoState( -+ HANDLE hContext, -+ SWR_STREAMOUT_STATE* pSoState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->soState = *pSoState; -+} -+ -+void SwrSetSoBuffers( -+ HANDLE hContext, -+ SWR_STREAMOUT_BUFFER* pSoBuffer, -+ uint32_t slot) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); -+ -+ pState->soBuffer[slot] = *pSoBuffer; -+} -+ -+void SwrSetVertexFunc( -+ HANDLE hContext, -+ PFN_VERTEX_FUNC pfnVertexFunc) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->pfnVertexFunc = pfnVertexFunc; -+} -+ -+void SwrSetFrontendState( -+ HANDLE hContext, -+ SWR_FRONTEND_STATE *pFEState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->frontendState = *pFEState; -+} -+ -+void SwrSetGsState( -+ HANDLE hContext, -+ SWR_GS_STATE *pGSState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->gsState = *pGSState; -+} -+ -+void SwrSetGsFunc( -+ HANDLE hContext, -+ PFN_GS_FUNC pfnGsFunc) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->pfnGsFunc = pfnGsFunc; -+} -+ -+void SwrSetCsFunc( -+ HANDLE hContext, -+ PFN_CS_FUNC pfnCsFunc, -+ uint32_t totalThreadsInGroup) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->pfnCsFunc = pfnCsFunc; -+ pState->totalThreadsInGroup = totalThreadsInGroup; -+} -+ -+void SwrSetTsState( -+ HANDLE hContext, -+ SWR_TS_STATE *pState) -+{ -+ API_STATE* pApiState = GetDrawState(GetContext(hContext)); -+ pApiState->tsState = *pState; -+} -+ -+void SwrSetHsFunc( -+ HANDLE hContext, -+ PFN_HS_FUNC pfnFunc) -+{ -+ API_STATE* pApiState = GetDrawState(GetContext(hContext)); -+ pApiState->pfnHsFunc = pfnFunc; -+} -+ -+void SwrSetDsFunc( -+ HANDLE hContext, -+ PFN_DS_FUNC pfnFunc) -+{ -+ API_STATE* pApiState = GetDrawState(GetContext(hContext)); -+ pApiState->pfnDsFunc = pfnFunc; -+} -+ -+void SwrSetDepthStencilState( -+ HANDLE hContext, -+ SWR_DEPTH_STENCIL_STATE *pDSState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->depthStencilState = *pDSState; -+} -+ -+void SwrSetBackendState( -+ HANDLE hContext, -+ SWR_BACKEND_STATE *pBEState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->backendState = *pBEState; -+} -+ -+void SwrSetPixelShaderState( -+ HANDLE 
hContext, -+ SWR_PS_STATE *pPSState) -+{ -+ API_STATE *pState = GetDrawState(GetContext(hContext)); -+ pState->psState = *pPSState; -+} -+ -+void SwrSetBlendState( -+ HANDLE hContext, -+ SWR_BLEND_STATE *pBlendState) -+{ -+ API_STATE *pState = GetDrawState(GetContext(hContext)); -+ memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); -+} -+ -+void SwrSetBlendFunc( -+ HANDLE hContext, -+ uint32_t renderTarget, -+ PFN_BLEND_JIT_FUNC pfnBlendFunc) -+{ -+ SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); -+ API_STATE *pState = GetDrawState(GetContext(hContext)); -+ pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; -+} -+ -+void SwrSetLinkage( -+ HANDLE hContext, -+ uint32_t mask, -+ const uint8_t* pMap) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ static const uint8_t IDENTITY_MAP[] = -+ { -+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -+ }; -+ static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap), -+ "Update for new value of MAX_ATTRIBUTES"); -+ -+ pState->linkageMask = mask; -+ pState->linkageCount = _mm_popcnt_u32(mask); -+ -+ if (!pMap) -+ { -+ pMap = IDENTITY_MAP; -+ } -+ memcpy(pState->linkageMap, pMap, pState->linkageCount); -+} -+ -+// update guardband multipliers for the viewport -+void updateGuardband(API_STATE *pState) -+{ -+ // guardband center is viewport center -+ pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; -+ pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; -+ pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; -+ pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; -+} -+ -+void SwrSetRastState( -+ HANDLE hContext, -+ const SWR_RASTSTATE *pRastState) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ API_STATE* pState = GetDrawState(pContext); -+ -+ memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); -+} -+ -+void SwrSetViewports( -+ HANDLE hContext, -+ uint32_t numViewports, -+ const SWR_VIEWPORT* pViewports, -+ const SWR_VIEWPORT_MATRIX* pMatrices) -+{ -+ SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, -+ "Invalid number of viewports."); -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ API_STATE* pState = GetDrawState(pContext); -+ -+ memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); -+ -+ if (pMatrices != nullptr) -+ { -+ memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); -+ } -+ else -+ { -+ // Compute default viewport transform. -+ for (uint32_t i = 0; i < numViewports; ++i) -+ { -+ if (pContext->driverType == DX) -+ { -+ pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; -+ pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; -+ pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; -+ pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; -+ pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; -+ pState->vpMatrix[i].m32 = pState->vp[i].minZ; -+ } -+ else -+ { -+ // Standard, with the exception that Y is inverted. 
-+ pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; -+ pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; -+ pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; -+ pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; -+ pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; -+ pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; -+ -+ // Now that the matrix is calculated, clip the view coords to screen size. -+ // OpenGL allows for -ve x,y in the viewport. -+ pState->vp[i].x = std::max(pState->vp[i].x, 0.0f); -+ pState->vp[i].y = std::max(pState->vp[i].y, 0.0f); -+ } -+ } -+ } -+ -+ updateGuardband(pState); -+} -+ -+void SwrSetScissorRects( -+ HANDLE hContext, -+ uint32_t numScissors, -+ const BBOX* pScissors) -+{ -+ SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, -+ "Invalid number of scissor rects."); -+ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX)); -+}; -+ -+void SetupMacroTileScissors(DRAW_CONTEXT *pDC) -+{ -+ API_STATE *pState = &pDC->pState->state; -+ uint32_t left, right, top, bottom; -+ -+ // Set up scissor dimensions based on scissor or viewport -+ if (pState->rastState.scissorEnable) -+ { -+ // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges -+ left = pState->scissorRects[0].left; -+ right = pState->scissorRects[0].right; -+ top = pState->scissorRects[0].top; -+ bottom = pState->scissorRects[0].bottom; -+ } -+ else -+ { -+ left = (int32_t)pState->vp[0].x; -+ right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width; -+ top = (int32_t)pState->vp[0].y; -+ bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height; -+ } -+ -+ pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE; -+ pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1; -+ pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE; -+ pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; -+} -+ -+void SetupPipeline(DRAW_CONTEXT *pDC) -+{ -+ DRAW_STATE* pState = pDC->pState; -+ -+ // setup backend -+ if (pState->state.psState.pfnPixelShader == nullptr) -+ { -+ pState->pfnBackend = &BackendNullPS; -+ } -+ else -+ { -+ bool bMultisampleEnable = (pState->state.rastState.sampleCount > SWR_MULTISAMPLE_1X) ? 
1 : 0; -+ -+ // select backend function based on max slot used by PS -+ switch(pState->state.psState.shadingRate) -+ { -+ case SWR_SHADING_RATE_PIXEL: -+ if(bMultisampleEnable) -+ { -+ pState->pfnBackend = gPixelRateBackendTable[pState->state.rastState.sampleCount-1][pState->state.psState.maxRTSlotUsed]; -+ } -+ else -+ { -+ pState->pfnBackend = gSingleSampleBackendTable[pState->state.psState.maxRTSlotUsed]; -+ } -+ break; -+ case SWR_SHADING_RATE_SAMPLE: -+ ///@todo Do we need to obey sample rate -+ if (!bMultisampleEnable) -+ { -+ // If PS is set at per sample rate and multisampling is disabled, set to per pixel and single sample backend -+ pState->state.psState.shadingRate = SWR_SHADING_RATE_PIXEL; -+ pState->pfnBackend = gSingleSampleBackendTable[pState->state.psState.maxRTSlotUsed]; -+ } -+ else -+ { -+ pState->pfnBackend = gSampleRateBackendTable[pState->state.rastState.sampleCount-1][pState->state.psState.maxRTSlotUsed]; -+ } -+ break; -+ case SWR_SHADING_RATE_COARSE: -+ default: -+ assert(0 && "Invalid shading rate"); -+ break; -+ } -+ } -+ -+ PFN_PROCESS_PRIMS pfnBinner; -+ switch (pState->state.topology) -+ { -+ case TOP_POINT_LIST: -+ pState->pfnProcessPrims = CanUseSimplePoints(pDC) ? ClipPoints : ClipTriangles; -+ pfnBinner = CanUseSimplePoints(pDC) ? BinPoints : BinTriangles; -+ break; -+ case TOP_LINE_LIST: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LOOP: -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LISTSTRIP_ADJ: -+ pState->pfnProcessPrims = ClipLines; -+ pfnBinner = BinLines; -+ break; -+ default: -+ pState->pfnProcessPrims = ClipTriangles; -+ pfnBinner = BinTriangles; -+ break; -+ }; -+ -+ // disable clipper if viewport transform is disabled -+ if (pState->state.frontendState.vpTransformDisable) -+ { -+ pState->pfnProcessPrims = pfnBinner; -+ } -+ -+ if ((pState->state.psState.pfnPixelShader == nullptr) && -+ (pState->state.depthStencilState.depthTestEnable == FALSE) && -+ (pState->state.depthStencilState.depthWriteEnable == FALSE) && -+ (pState->state.linkageCount == 0)) -+ { -+ pState->pfnProcessPrims = nullptr; -+ pState->state.linkageMask = 0; -+ } -+ -+ if (pState->state.soState.rasterizerDisable == true) -+ { -+ pState->pfnProcessPrims = nullptr; -+ pState->state.linkageMask = 0; -+ } -+ -+ // set up the frontend attrib mask -+ pState->state.feAttribMask = pState->state.linkageMask; -+ if (pState->state.soState.soEnable) -+ { -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ pState->state.feAttribMask |= pState->state.soState.streamMasks[i]; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief InitDraw -+/// @param pDC - Draw context to initialize for this draw. -+void InitDraw( -+ DRAW_CONTEXT *pDC, -+ bool isSplitDraw) -+{ -+ // We don't need to re-setup the scissors/pipeline state again for split draw. -+ if (isSplitDraw == false) -+ { -+ SetupMacroTileScissors(pDC); -+ SetupPipeline(pDC); -+ } -+ -+ pDC->inUse = true; // We are using this one now. 
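// [Editor's note] Worked example of the scissor conversion in
// SetupMacroTileScissors() above: right/bottom edges are exclusive at the API
// level but inclusive in the core's fixed-point rect, hence the "- 1".
// FIXED_POINT_SCALE is assumed to be 256 (8 fractional bits) purely for this
// sketch; the real value is a rasterizer knob defined elsewhere. Illustration
// only, not part of the original patch.
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t FIXED_POINT_SCALE = 256;   // assumption for this sketch
    const uint32_t left = 0, right = 1920, top = 0, bottom = 1080;

    const int32_t fxLeft   = left   * FIXED_POINT_SCALE;       // 0
    const int32_t fxRight  = right  * FIXED_POINT_SCALE - 1;   // 491519
    const int32_t fxTop    = top    * FIXED_POINT_SCALE;       // 0
    const int32_t fxBottom = bottom * FIXED_POINT_SCALE - 1;   // 276479

    printf("scissor fx: [%d, %d] x [%d, %d]\n", fxLeft, fxRight, fxTop, fxBottom);
    return 0;
}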
-+ -+ /// @todo: remove when we send down preset sample patterns (standard or center) -+ // If multisampling is enabled, precompute float sample offsets from fixed -+ uint32_t numSamples = pDC->pState->state.rastState.sampleCount; -+ if(numSamples > SWR_MULTISAMPLE_1X) -+ { -+ static const float fixed8Scale = 1.0f/FIXED_POINT_SCALE; -+ float* pSamplePos = pDC->pState->state.samplePos; -+ SWR_MULTISAMPLE_POS(&iSamplePos)[SWR_MAX_NUM_MULTISAMPLES] = pDC->pState->state.rastState.iSamplePos; -+ -+ for(uint32_t i = 0; i < numSamples; i++) -+ { -+ *(pSamplePos++) = ((float)(iSamplePos[i].x) * fixed8Scale); -+ *(pSamplePos++) = ((float)(iSamplePos[i].y) * fixed8Scale); -+ } -+ } -+ // just test the masked off samples once per draw and use the results in the backend. -+ SWR_RASTSTATE &rastState = pDC->pState->state.rastState; -+ uint32_t sampleMask = rastState.sampleMask; -+ for(uint32_t i = 0; i < SWR_MAX_NUM_MULTISAMPLES; i++) -+ { -+ rastState.isSampleMasked[i] = !(sampleMask & 1); -+ sampleMask>>=1; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief We can split the draw for certain topologies for better performance. -+/// @param totalVerts - Total vertices for draw -+/// @param topology - Topology used for draw -+uint32_t MaxVertsPerDraw( -+ DRAW_CONTEXT* pDC, -+ uint32_t totalVerts, -+ PRIMITIVE_TOPOLOGY topology) -+{ -+ API_STATE& state = pDC->pState->state; -+ -+ uint32_t vertsPerDraw = totalVerts; -+ -+ if (state.soState.soEnable) -+ { -+ return totalVerts; -+ } -+ -+ switch (topology) -+ { -+ case TOP_POINT_LIST: -+ case TOP_TRIANGLE_LIST: -+ vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; -+ break; -+ -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ if (pDC->pState->state.tsState.tsEnable) -+ { -+ uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; -+ vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; -+ } -+ break; -+ -+ default: -+ // We are not splitting up draws for other topologies. -+ break; -+ } -+ -+ return vertsPerDraw; -+} -+ -+// Recursive template used to auto-nest conditionals. Converts dynamic boolean function -+// arguments to static template arguments. -+template -+struct FEDrawChooser -+{ -+ // Last Arg Terminator -+ static PFN_FE_WORK_FUNC GetFunc(bool bArg) -+ { -+ if (bArg) -+ { -+ return ProcessDraw; -+ } -+ -+ return ProcessDraw; -+ } -+ -+ // Recursively parse args -+ template -+ static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... 
remainingArgs) -+ { -+ if (bArg) -+ { -+ return FEDrawChooser::GetFunc(remainingArgs...); -+ } -+ -+ return FEDrawChooser::GetFunc(remainingArgs...); -+ } -+}; -+ -+// Selector for correct templated Draw front-end function -+INLINE -+static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled) -+{ -+ return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled); -+} -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numVerts - How many vertices to read sequentially from vertex data (per instance). -+/// @param startVertex - Specifies start vertex for draw. (vertex data) -+/// @param numInstances - How many instances to render. -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void DrawInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numVertices, -+ uint32_t startVertex, -+ uint32_t numInstances = 1, -+ uint32_t startInstance = 0) -+{ -+ RDTSC_START(APIDraw); -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_DRAW) -+ { -+ return; -+ } -+#endif -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); -+ uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); -+ int32_t remainingVerts = numVertices; -+ -+ API_STATE *pState = &pDC->pState->state; -+ pState->topology = topology; -+ pState->forceFront = false; -+ -+ // disable culling for points/lines -+ uint32_t oldCullMode = pState->rastState.cullMode; -+ if (topology == TOP_POINT_LIST) -+ { -+ pState->rastState.cullMode = SWR_CULLMODE_NONE; -+ pState->forceFront = true; -+ } -+ -+ int draw = 0; -+ while (remainingVerts) -+ { -+ uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? -+ remainingVerts : maxVertsPerDraw; -+ -+ bool isSplitDraw = (draw > 0) ? true : false; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); -+ InitDraw(pDC, isSplitDraw); -+ -+ pDC->FeWork.type = DRAW; -+ pDC->FeWork.pfnWork = GetFEDrawFunc( -+ false, // IsIndexed -+ pState->tsState.tsEnable, -+ pState->gsState.gsEnable, -+ pState->soState.soEnable, -+ pDC->pState->pfnProcessPrims != nullptr); -+ pDC->FeWork.desc.draw.numVerts = numVertsForDraw; -+ pDC->FeWork.desc.draw.startVertex = startVertex + draw * maxVertsPerDraw; -+ pDC->FeWork.desc.draw.numInstances = numInstances; -+ pDC->FeWork.desc.draw.startInstance = startInstance; -+ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; -+ -+ //enqueue DC -+ QueueDraw(pContext); -+ -+ remainingVerts -= numVertsForDraw; -+ draw++; -+ } -+ -+ // restore culling state -+ pDC = GetDrawContext(pContext); -+ pDC->pState->state.rastState.cullMode = oldCullMode; -+ -+ RDTSC_STOP(APIDraw, numVertices * numInstances, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDraw -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param startVertex - Specifies start vertex in vertex buffer for draw. -+/// @param primCount - Number of vertices. 
-+void SwrDraw( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t startVertex, -+ uint32_t numVertices) -+{ -+ DrawInstanced(hContext, topology, numVertices, startVertex); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. -+/// @param numInstances - How many instances to render. -+/// @param startVertex - Specifies start vertex for draw. (vertex data) -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SwrDrawInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numVertsPerInstance, -+ uint32_t numInstances, -+ uint32_t startVertex, -+ uint32_t startInstance -+ ) -+{ -+ DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawIndexedInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -+/// @param numInstances - Number of instances to render. -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void DrawIndexedInstance( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t indexOffset, -+ int32_t baseVertex, -+ uint32_t numInstances = 1, -+ uint32_t startInstance = 0) -+{ -+ RDTSC_START(APIDrawIndexed); -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ API_STATE* pState = &pDC->pState->state; -+ -+ int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); -+ uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); -+ int32_t remainingIndices = numIndices; -+ -+ uint32_t indexSize = 0; -+ switch (pState->indexBuffer.format) -+ { -+ case R32_UINT: indexSize = sizeof(uint32_t); break; -+ case R16_UINT: indexSize = sizeof(uint16_t); break; -+ case R8_UINT: indexSize = sizeof(uint8_t); break; -+ default: -+ SWR_ASSERT(0); -+ } -+ -+ int draw = 0; -+ uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; -+ pIB += (uint64_t)indexOffset * (uint64_t)indexSize; -+ -+ pState->topology = topology; -+ pState->forceFront = false; -+ -+ // disable culling for points/lines -+ uint32_t oldCullMode = pState->rastState.cullMode; -+ if (topology == TOP_POINT_LIST) -+ { -+ pState->rastState.cullMode = SWR_CULLMODE_NONE; -+ pState->forceFront = true; -+ } -+ -+ while (remainingIndices) -+ { -+ uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? -+ remainingIndices : maxIndicesPerDraw; -+ -+ // When breaking up draw, we need to obtain new draw context for each iteration. -+ bool isSplitDraw = (draw > 0) ? 
true : false; -+ pDC = GetDrawContext(pContext, isSplitDraw); -+ InitDraw(pDC, isSplitDraw); -+ -+ pDC->FeWork.type = DRAW; -+ pDC->FeWork.pfnWork = GetFEDrawFunc( -+ true, // IsIndexed -+ pState->tsState.tsEnable, -+ pState->gsState.gsEnable, -+ pState->soState.soEnable, -+ pDC->pState->pfnProcessPrims != nullptr); -+ pDC->FeWork.desc.draw.pDC = pDC; -+ pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; -+ pDC->FeWork.desc.draw.pIB = (int*)pIB; -+ pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; -+ -+ pDC->FeWork.desc.draw.numInstances = numInstances; -+ pDC->FeWork.desc.draw.startInstance = startInstance; -+ pDC->FeWork.desc.draw.baseVertex = baseVertex; -+ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; -+ -+ //enqueue DC -+ QueueDraw(pContext); -+ -+ pIB += maxIndicesPerDraw * indexSize; -+ remainingIndices -= numIndicesForDraw; -+ draw++; -+ } -+ -+ // restore culling state -+ pDC = GetDrawContext(pContext); -+ pDC->pState->state.rastState.cullMode = oldCullMode; -+ -+ RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); -+} -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawIndexed -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -+void SwrDrawIndexed( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t indexOffset, -+ int32_t baseVertex -+ ) -+{ -+ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawIndexedInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param numInstances - Number of instances to render. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
-+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SwrDrawIndexedInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t numInstances, -+ uint32_t indexOffset, -+ int32_t baseVertex, -+ uint32_t startInstance) -+{ -+ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); -+} -+ -+// Attach surfaces to pipeline -+void SwrInvalidateTiles( -+ HANDLE hContext, -+ uint32_t attachmentMask) -+{ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ pDC->inUse = true; -+ -+ // Queue a load to the hottile -+ pDC->FeWork.type = INVALIDATETILES; -+ pDC->FeWork.pfnWork = ProcessInvalidateTiles; -+ pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; -+ -+ //enqueue -+ QueueDraw(pContext); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDispatch -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param threadGroupCountX - Number of thread groups dispatched in X direction -+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction -+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -+void SwrDispatch( -+ HANDLE hContext, -+ uint32_t threadGroupCountX, -+ uint32_t threadGroupCountY, -+ uint32_t threadGroupCountZ) -+{ -+ RDTSC_START(APIDispatch); -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->isCompute = true; // This is a compute context. -+ pDC->inUse = true; -+ -+ COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->arena.AllocAligned(sizeof(COMPUTE_DESC), 64); -+ -+ pTaskData->threadGroupCountX = threadGroupCountX; -+ pTaskData->threadGroupCountY = threadGroupCountY; -+ pTaskData->threadGroupCountZ = threadGroupCountZ; -+ -+ uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; -+ pDC->pDispatch->initialize(totalThreadGroups, pTaskData); -+ -+ QueueDispatch(pContext); -+ RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); -+} -+ -+// Deswizzles, converts and stores current contents of the hot tiles to surface -+// described by pState -+void SwrStoreTiles( -+ HANDLE hContext, -+ SWR_RENDERTARGET_ATTACHMENT attachment, -+ SWR_TILE_STATE postStoreTileState) // TODO: Implement postStoreTileState -+{ -+ RDTSC_START(APIStoreTiles); -+ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ pDC->inUse = true; -+ -+ SetupMacroTileScissors(pDC); -+ -+ pDC->FeWork.type = STORETILES; -+ pDC->FeWork.pfnWork = ProcessStoreTiles; -+ pDC->FeWork.desc.storeTiles.attachment = attachment; -+ pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; -+ -+ //enqueue -+ QueueDraw(pContext); -+ -+ RDTSC_STOP(APIStoreTiles, 0, 0); -+ if (attachment == SWR_ATTACHMENT_COLOR0) -+ { -+ RDTSC_ENDFRAME(); -+ } -+} -+ -+void SwrClearRenderTarget( -+ HANDLE hContext, -+ uint32_t clearMask, -+ const float clearColor[4], -+ float z, -+ BYTE stencil) -+{ -+ RDTSC_START(APIClearRenderTarget); -+ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ SetupMacroTileScissors(pDC); -+ -+ pDC->inUse = true; -+ -+ CLEAR_FLAGS flags; -+ flags.mask = clearMask; -+ -+ pDC->FeWork.type = CLEAR; -+ pDC->FeWork.pfnWork = ProcessClear; -+ pDC->FeWork.desc.clear.flags = 
flags; -+ pDC->FeWork.desc.clear.clearDepth = z; -+ pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; -+ pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; -+ pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; -+ pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; -+ pDC->FeWork.desc.clear.clearStencil = stencil; -+ -+ // enqueue draw -+ QueueDraw(pContext); -+ -+ RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns a pointer to the private context state for the current -+/// draw operation. This is used for external componets such as the -+/// sampler. -+/// SWR is responsible for the allocation of the private context state. -+/// @param hContext - Handle passed back from SwrCreateContext -+VOID* SwrGetPrivateContextState( -+ HANDLE hContext) -+{ -+ SWR_CONTEXT* pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ DRAW_STATE* pState = pDC->pState; -+ -+ if (pState->pPrivateState == nullptr) -+ { -+ pState->pPrivateState = pState->arena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); -+ } -+ -+ return pState->pPrivateState; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Clients can use this to allocate memory for draw/dispatch -+/// operations. The memory will automatically be freed once operation -+/// has completed. Client can use this to allocate binding tables, -+/// etc. needed for shader execution. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param size - Size of allocation -+/// @param align - Alignment needed for allocation. -+VOID* SwrAllocDrawContextMemory( -+ HANDLE hContext, -+ uint32_t size, -+ uint32_t align) -+{ -+ SWR_CONTEXT* pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ return pDC->pState->arena.AllocAligned(size, align); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns pointer to SWR stats. -+/// @note The counters are atomically incremented by multiple threads. -+/// When calling this, you need to ensure all previous operations -+/// have completed. -+/// @todo If necessary, add a callback to avoid stalling the pipe to -+/// sample the counters. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pStats - SWR will fill this out for caller. -+void SwrGetStats( -+ HANDLE hContext, -+ SWR_STATS* pStats) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->inUse = true; -+ -+ pDC->FeWork.type = QUERYSTATS; -+ pDC->FeWork.pfnWork = ProcessQueryStats; -+ pDC->FeWork.desc.queryStats.pStats = pStats; -+ -+ // cannot execute until all previous draws have completed -+ pDC->dependency = pDC->drawId - 1; -+ -+ //enqueue -+ QueueDraw(pContext); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Enables stats counting -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param enable - If true then counts are incremented. 
-+void SwrEnableStats( -+ HANDLE hContext, -+ bool enable) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->pState->state.enableStats = enable; -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h -new file mode 100644 -index 0000000..1741ef6 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/api.h -@@ -0,0 +1,483 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file api.h -+* -+* @brief API definitions -+* -+******************************************************************************/ -+ -+#ifndef __SWR_API_H__ -+#define __SWR_API_H__ -+ -+#include "common/os.h" -+ -+#include -+#include -+ -+#include "common/simdintrin.h" -+#include "common/formats.h" -+#include "core/utils.h" -+#include "core/state.h" -+ -+///@todo place all the API functions into the 'swr' namespace. 
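// [Editor's note] Hedged end-to-end sketch of the API implemented in api.cpp
// above and declared in the remainder of api.h below: create a context with the
// driver's tile callbacks, set a viewport, queue a draw, then flush hot tiles
// and wait. Shader and vertex-buffer setup is elided; struct fields beyond those
// used in api.cpp are assumptions. Illustration only, not part of the original patch.
static void RenderOneTriangleList(const SWR_CREATECONTEXT_INFO& createInfo)
{
    HANDLE hContext = SwrCreateContext(&createInfo);

    SWR_VIEWPORT vp = {};
    vp.x = 0.0f;   vp.y = 0.0f;
    vp.width = 1920.0f;   vp.height = 1080.0f;
    vp.minZ = 0.0f;   vp.maxZ = 1.0f;
    SwrSetViewports(hContext, 1, &vp, nullptr);   // nullptr => SWR computes default matrix

    // ... SwrSetVertexBuffers / SwrSetFetchFunc / SwrSetVertexFunc /
    //     SwrSetPixelShaderState / SwrSetLinkage etc. would go here ...

    SwrDraw(hContext, TOP_TRIANGLE_LIST, /*startVertex*/ 0, /*numVertices*/ 3);

    // Resolve color hot tiles back to the bound surface, then block until done.
    SwrStoreTiles(hContext, SWR_ATTACHMENT_COLOR0, SWR_TILE_RESOLVED);
    SwrWaitForIdle(hContext);

    SwrDestroyContext(hContext);
}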
-+ -+typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Function signature for load hot tiles -+/// @param hPrivateContext - handle to private data -+/// @param dstFormat - format of the hot tile -+/// @param renderTargetIndex - render target to store, can be color, depth or stencil -+/// @param x - destination x coordinate -+/// @param y - destination y coordinate -+/// @param pDstHotTile - pointer to the hot tile surface -+typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Function signature for store hot tiles -+/// @param hPrivateContext - handle to private data -+/// @param srcFormat - format of the hot tile -+/// @param renderTargetIndex - render target to store, can be color, depth or stencil -+/// @param x - destination x coordinate -+/// @param y - destination y coordinate -+/// @param pSrcHotTile - pointer to the hot tile surface -+typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); -+ -+/// @brief Function signature for clearing from the hot tiles clear value -+/// @param hPrivateContext - handle to private data -+/// @param renderTargetIndex - render target to store, can be color, depth or stencil -+/// @param x - destination x coordinate -+/// @param y - destination y coordinate -+/// @param pClearColor - pointer to the hot tile's clear value -+typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, -+ SWR_RENDERTARGET_ATTACHMENT rtIndex, -+ uint32_t x, uint32_t y, const float* pClearColor); -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_CREATECONTEXT_INFO -+///////////////////////////////////////////////////////////////////////// -+struct SWR_CREATECONTEXT_INFO -+{ -+ DRIVER_TYPE driver; -+ -+ // External functions (e.g. sampler) need per draw context state. -+ // Use SwrGetPrivateContextState() to access private state. -+ uint32_t privateStateSize; -+ -+ // tile manipulation functions -+ PFN_LOAD_TILE pfnLoadTile; -+ PFN_STORE_TILE pfnStoreTile; -+ PFN_CLEAR_TILE pfnClearTile; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_RECT -+///////////////////////////////////////////////////////////////////////// -+struct SWR_RECT -+{ -+ uint32_t left; -+ uint32_t right; -+ uint32_t top; -+ uint32_t bottom; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create SWR Context. -+/// @param pCreateInfo - pointer to creation info. -+HANDLE SWR_API SwrCreateContext( -+ const SWR_CREATECONTEXT_INFO* pCreateInfo); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Destroys SWR Context. -+/// @param hContext - Handle passed back from SwrCreateContext -+void SWR_API SwrDestroyContext( -+ HANDLE hContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sync cmd. 
Executes the callback func when all rendering up to this sync -+/// has been completed -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFunc - pointer to callback function, -+/// @param userData - user data to pass back -+void SWR_API SwrSync( -+ HANDLE hContext, -+ PFN_CALLBACK_FUNC pfnFunc, -+ uint64_t userData, -+ uint64_t userData2); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Blocks until all rendering has been completed. -+/// @param hContext - Handle passed back from SwrCreateContext -+void SWR_API SwrWaitForIdle( -+ HANDLE hContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set vertex buffer state. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param numBuffers - Number of vertex buffer state descriptors. -+/// @param pVertexBuffers - Array of vertex buffer state descriptors. -+void SWR_API SwrSetVertexBuffers( -+ HANDLE hContext, -+ uint32_t numBuffers, -+ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set index buffer -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pIndexBuffer - Index buffer. -+void SWR_API SwrSetIndexBuffer( -+ HANDLE hContext, -+ const SWR_INDEX_BUFFER_STATE* pIndexBuffer); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set fetch shader pointer. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFetchFunc - Pointer to shader. -+void SWR_API SwrSetFetchFunc( -+ HANDLE hContext, -+ PFN_FETCH_FUNC pfnFetchFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set streamout shader pointer. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnSoFunc - Pointer to shader. -+/// @param streamIndex - specifies stream -+void SWR_API SwrSetSoFunc( -+ HANDLE hContext, -+ PFN_SO_FUNC pfnSoFunc, -+ uint32_t streamIndex); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set streamout state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pSoState - Pointer to streamout state. -+void SWR_API SwrSetSoState( -+ HANDLE hContext, -+ SWR_STREAMOUT_STATE* pSoState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set streamout buffer state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pSoBuffer - Pointer to streamout buffer. -+/// @param slot - Slot to bind SO buffer to. -+void SWR_API SwrSetSoBuffers( -+ HANDLE hContext, -+ SWR_STREAMOUT_BUFFER* pSoBuffer, -+ uint32_t slot); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set vertex shader pointer. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnVertexFunc - Pointer to shader. -+void SWR_API SwrSetVertexFunc( -+ HANDLE hContext, -+ PFN_VERTEX_FUNC pfnVertexFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set frontend state. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state -+void SWR_API SwrSetFrontendState( -+ HANDLE hContext, -+ SWR_FRONTEND_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set geometry shader state. 
-+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state -+void SWR_API SwrSetGsState( -+ HANDLE hContext, -+ SWR_GS_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set geometry shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to geometry shader function -+void SWR_API SwrSetGsFunc( -+ HANDLE hContext, -+ PFN_GS_FUNC pfnGsFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set compute shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to compute shader function -+/// @param totalThreadsInGroup - product of thread group dimensions. -+void SWR_API SwrSetCsFunc( -+ HANDLE hContext, -+ PFN_CS_FUNC pfnCsFunc, -+ uint32_t totalThreadsInGroup); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set tessellation state. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state -+void SWR_API SwrSetTsState( -+ HANDLE hContext, -+ SWR_TS_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set hull shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFunc - Pointer to shader function -+void SWR_API SwrSetHsFunc( -+ HANDLE hContext, -+ PFN_HS_FUNC pfnFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set domain shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFunc - Pointer to shader function -+void SWR_API SwrSetDsFunc( -+ HANDLE hContext, -+ PFN_DS_FUNC pfnFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set depth stencil state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. -+void SWR_API SwrSetDepthStencilState( -+ HANDLE hContext, -+ SWR_DEPTH_STENCIL_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set backend state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. -+void SWR_API SwrSetBackendState( -+ HANDLE hContext, -+ SWR_BACKEND_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set pixel shader state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. -+void SWR_API SwrSetPixelShaderState( -+ HANDLE hContext, -+ SWR_PS_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set blend state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. 
-+void SWR_API SwrSetBlendState( -+ HANDLE hContext, -+ SWR_BLEND_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set blend function -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param renderTarget - render target index -+/// @param pfnBlendFunc - function pointer -+void SWR_API SwrSetBlendFunc( -+ HANDLE hContext, -+ uint32_t renderTarget, -+ PFN_BLEND_JIT_FUNC pfnBlendFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set linkage mask -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param mask - Specifies which vertex outputs are are needed by PS. -+/// @param pMap - (Optional)Linkage map to specify where FE attributes are -+/// gathered from to supply PS attribute values. The length -+/// of the map buffer needs to match the number of set bits -+/// in "mask". -+void SWR_API SwrSetLinkage( -+ HANDLE hContext, -+ uint32_t mask, -+ const uint8_t* pMap); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDraw -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param startVertex - Specifies start vertex in vertex buffer for draw. -+/// @param primCount - Number of vertices. -+void SWR_API SwrDraw( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t startVertex, -+ uint32_t primCount); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. -+/// @param numInstances - How many instances to render. -+/// @param startVertex - Specifies start vertex for draw. (vertex data) -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SWR_API SwrDrawInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numVertsPerInstance, -+ uint32_t numInstances, -+ uint32_t startVertex, -+ uint32_t startInstance); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawIndexed -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -+void SWR_API SwrDrawIndexed( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t indexOffset, -+ int32_t baseVertex); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawIndexedInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param numInstances - Number of instances to render. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
-+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SWR_API SwrDrawIndexedInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t numInstances, -+ uint32_t indexOffset, -+ int32_t baseVertex, -+ uint32_t startInstance); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrInvalidateTiles -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. -+void SWR_API SwrInvalidateTiles( -+ HANDLE hContext, -+ uint32_t attachmentMask); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDispatch -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param threadGroupCountX - Number of thread groups dispatched in X direction -+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction -+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -+void SWR_API SwrDispatch( -+ HANDLE hContext, -+ uint32_t threadGroupCountX, -+ uint32_t threadGroupCountY, -+ uint32_t threadGroupCountZ); -+ -+ -+enum SWR_TILE_STATE -+{ -+ SWR_TILE_INVALID = 0, // tile is in unitialized state and should be loaded with surface contents before rendering -+ SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents -+ SWR_TILE_RESOLVED = 3, // is in sync with surface it represents -+}; -+ -+/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs. -+void SWR_API SwrStoreTiles( -+ HANDLE hContext, -+ SWR_RENDERTARGET_ATTACHMENT attachment, -+ SWR_TILE_STATE postStoreTileState); -+ -+void SWR_API SwrClearRenderTarget( -+ HANDLE hContext, -+ uint32_t clearMask, -+ const FLOAT clearColor[4], -+ float z, -+ BYTE stencil); -+ -+void SWR_API SwrSetRastState( -+ HANDLE hContext, -+ const SWR_RASTSTATE *pRastState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrSetViewports -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param numViewports - number of viewports passed in -+/// @param pViewports - Specifies extents of viewport. -+/// @param pMatrices - If not specified then SWR computes a default one. -+void SWR_API SwrSetViewports( -+ HANDLE hContext, -+ uint32_t numViewports, -+ const SWR_VIEWPORT* pViewports, -+ const SWR_VIEWPORT_MATRIX* pMatrices); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrSetScissorRects -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param numScissors - number of scissors passed in -+/// @param pScissors - array of scissors -+void SWR_API SwrSetScissorRects( -+ HANDLE hContext, -+ uint32_t numScissors, -+ const BBOX* pScissors); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns a pointer to the private context state for the current -+/// draw operation. This is used for external componets such as the -+/// sampler. -+/// -+/// @note Client needs to resend private state prior to each draw call. -+/// Also, SWR is responsible for the private state memory. 
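///
/// Illustrative driver-side sketch (hypothetical types; the layout of the
/// private state region is defined by the client, not by SWR):
///
///   MyDrawState* pPriv = (MyDrawState*)SwrGetPrivateContextState(hContext);
///   *pPriv = currentDrawState;   // repopulated before every draw
///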
-+/// @param hContext - Handle passed back from SwrCreateContext -+VOID* SWR_API SwrGetPrivateContextState( -+ HANDLE hContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Clients can use this to allocate memory for draw/dispatch -+/// operations. The memory will automatically be freed once operation -+/// has completed. Client can use this to allocate binding tables, -+/// etc. needed for shader execution. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param size - Size of allocation -+/// @param align - Alignment needed for allocation. -+VOID* SWR_API SwrAllocDrawContextMemory( -+ HANDLE hContext, -+ uint32_t size, -+ uint32_t align); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns pointer to SWR stats. -+/// @note The counters are incremented by multiple threads. -+/// When calling this, you need to ensure all previous operations -+/// have completed. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pStats - SWR will fill this out for caller. -+void SWR_API SwrGetStats( -+ HANDLE hContext, -+ SWR_STATS* pStats); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Enables stats counting -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param enable - If true then counts are incremented. -+void SWR_API SwrEnableStats( -+ HANDLE hContext, -+ bool enable); -+ -+#endif//__SWR_API_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp -new file mode 100644 -index 0000000..bc4cfd8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/arena.cpp -@@ -0,0 +1,126 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file arena.cpp -+* -+* @brief Arena memory manager -+* The arena is convenient and fast for managing allocations for any of -+* our allocations that are associated with operations and can all be freed -+* once when their operation has completed. Allocations are cheap since -+* most of the time its simply an increment of an offset. Also, no need to -+* free individual allocations. All of the arena memory can be freed at once. 
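*
* A minimal usage sketch (illustrative only, not code taken from this patch):
*
*   Arena arena;
*   arena.Init();
*   void* pTmp = arena.AllocAligned(256, 16); // usually just bumps an offset
*   // ... pTmp stays valid until the arena is reset ...
*   arena.Reset();                            // releases every allocation at once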
-+* -+******************************************************************************/ -+ -+#include "context.h" -+#include "arena.h" -+ -+#include -+ -+VOID Arena::Init() -+{ -+ m_memUsed = 0; -+ m_pCurBlock = nullptr; -+ m_pUsedBlocks = nullptr; -+} -+ -+VOID* Arena::AllocAligned(uint32_t size, uint32_t align) -+{ -+ if (m_pCurBlock) -+ { -+ ArenaBlock* pCurBlock = m_pCurBlock; -+ pCurBlock->offset = AlignUp(pCurBlock->offset, align); -+ -+ if ((pCurBlock->offset + size) < pCurBlock->blockSize) -+ { -+ BYTE* pMem = (BYTE*)pCurBlock->pMem + pCurBlock->offset; -+ pCurBlock->offset += size; -+ return pMem; -+ } -+ -+ // Not enough memory in this arena so lets move to a new block. -+ pCurBlock->pNext = m_pUsedBlocks; -+ m_pUsedBlocks = pCurBlock; -+ m_pCurBlock = nullptr; -+ } -+ -+ static const uint32_t ArenaBlockSize = 1024*1024; -+ uint32_t defaultBlockSize = ArenaBlockSize; -+ if (m_pUsedBlocks == nullptr) -+ { -+ // First allocation after reset. Let's make the first block be the total -+ // memory allocated during last set of allocations prior to reset. -+ defaultBlockSize = std::max(m_memUsed, defaultBlockSize); -+ m_memUsed = 0; -+ } -+ -+ uint32_t blockSize = std::max(size, defaultBlockSize); -+ blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); -+ -+ VOID *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. -+ SWR_ASSERT(pMem != nullptr); -+ -+ m_pCurBlock = (ArenaBlock*)malloc(sizeof(ArenaBlock)); -+ SWR_ASSERT(m_pCurBlock != nullptr); -+ -+ if (m_pCurBlock != nullptr) -+ { -+ m_pCurBlock->pMem = pMem; -+ m_pCurBlock->blockSize = blockSize; -+ m_pCurBlock->offset = size; -+ m_memUsed += blockSize; -+ } -+ -+ return pMem; -+} -+ -+VOID* Arena::Alloc(uint32_t size) -+{ -+ return AllocAligned(size, 1); -+} -+ -+VOID Arena::Reset() -+{ -+ if (m_pCurBlock) -+ { -+ m_pCurBlock->offset = 0; -+ -+ // If we needed to allocate used blocks then reset current. -+ // The next time we allocate we'll grow the current block -+ // to match all the memory allocated this for this frame. -+ if (m_pUsedBlocks) -+ { -+ m_pCurBlock->pNext = m_pUsedBlocks; -+ m_pUsedBlocks = m_pCurBlock; -+ m_pCurBlock = nullptr; -+ } -+ } -+ -+ while(m_pUsedBlocks) -+ { -+ ArenaBlock* pBlock = m_pUsedBlocks; -+ m_pUsedBlocks = pBlock->pNext; -+ -+ _aligned_free(pBlock->pMem); -+ free(pBlock); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h -new file mode 100644 -index 0000000..e98bc83 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h -@@ -0,0 +1,63 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file arena.h -+* -+* @brief Arena memory manager -+* The arena is convenient and fast for managing allocations for any of -+* our allocations that are associated with operations and can all be freed -+* once when their operation has completed. Allocations are cheap since -+* most of the time its simply an increment of an offset. Also, no need to -+* free individual allocations. All of the arena memory can be freed at once. -+* -+******************************************************************************/ -+#pragma once -+ -+class Arena -+{ -+public: -+ Arena() : m_pCurBlock(nullptr), m_pUsedBlocks(nullptr), m_memUsed(0) { } -+ ~Arena() { } -+ -+ VOID Init(); -+ -+ VOID* AllocAligned(uint32_t size, uint32_t align); -+ VOID* Alloc(uint32_t size); -+ VOID Reset(); -+ -+private: -+ -+ struct ArenaBlock -+ { -+ ArenaBlock() : pMem(nullptr), blockSize(0), pNext(nullptr) {} -+ -+ VOID *pMem; -+ uint32_t blockSize; -+ uint32_t offset; -+ ArenaBlock *pNext; -+ }; -+ -+ ArenaBlock *m_pCurBlock; -+ ArenaBlock *m_pUsedBlocks; -+ -+ uint32_t m_memUsed; // total bytes allocated since last reset. -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp -new file mode 100644 -index 0000000..9cf2b00 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp -@@ -0,0 +1,1150 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file backend.cpp -+* -+* @brief Backend handles rasterization, pixel shading and output merger -+* operations. 
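*
* The BE ("backend") entry points below run on worker threads: workerId
* identifies the calling thread, and for per-macrotile work the macroTile
* argument encodes the tile's (x, y) position (decoded via
* MacroTileMgr::getTileIndices).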
-+* -+******************************************************************************/ -+ -+#include -+ -+#include "rdtsc_core.h" -+#include "backend.h" -+#include "depthstencil.h" -+#include "tilemgr.h" -+#include "memory/tilingtraits.h" -+#include "core/multisample.h" -+ -+#include -+ -+const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5}; -+const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5}; -+ -+/// @todo move to common lib -+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} -+static const __m128 gMaskToVec[] = { -+ MASKTOVEC(0,0,0,0), -+ MASKTOVEC(0,0,0,1), -+ MASKTOVEC(0,0,1,0), -+ MASKTOVEC(0,0,1,1), -+ MASKTOVEC(0,1,0,0), -+ MASKTOVEC(0,1,0,1), -+ MASKTOVEC(0,1,1,0), -+ MASKTOVEC(0,1,1,1), -+ MASKTOVEC(1,0,0,0), -+ MASKTOVEC(1,0,0,1), -+ MASKTOVEC(1,0,1,0), -+ MASKTOVEC(1,0,1,1), -+ MASKTOVEC(1,1,0,0), -+ MASKTOVEC(1,1,0,1), -+ MASKTOVEC(1,1,1,0), -+ MASKTOVEC(1,1,1,1), -+}; -+ -+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]); -+static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Process compute work. -+/// @param pDC - pointer to draw context (dispatch). -+/// @param workerId - The unique worker ID that is assigned to this thread. -+/// @param threadGroupId - the linear index for the thread group within the dispatch. -+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) -+{ -+ RDTSC_START(BEDispatch); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+ const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); -+ SWR_ASSERT(pTaskData != nullptr); -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ SWR_CS_CONTEXT csContext{ 0 }; -+ csContext.tileCounter = threadGroupId; -+ csContext.dispatchDims[0] = pTaskData->threadGroupCountX; -+ csContext.dispatchDims[1] = pTaskData->threadGroupCountY; -+ csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; -+ csContext.pTGSM = pContext->pScratch[workerId]; -+ -+ state.pfnCsFunc(GetPrivateState(pDC), &csContext); -+ -+ UPDATE_STAT(CsInvocations, state.totalThreadsInGroup); -+ -+ RDTSC_STOP(BEDispatch, 1, 0); -+} -+ -+void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -+{ -+ SYNC_DESC *pSync = (SYNC_DESC*)pUserData; -+ -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroTile, x, y); -+ SWR_ASSERT(x == 0 && y == 0); -+ -+ if (pSync->pfnCallbackFunc != nullptr) -+ { -+ pSync->pfnCallbackFunc(pSync->userData, pSync->userData2); -+ } -+} -+ -+void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -+{ -+ QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData; -+ SWR_STATS* pStats = pQueryDesc->pStats; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+ SWR_ASSERT(pStats != nullptr); -+ -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ pStats->DepthPassCount += pContext->stats[i].DepthPassCount; -+ -+ pStats->IaVertices += pContext->stats[i].IaVertices; -+ pStats->IaPrimitives += pContext->stats[i].IaPrimitives; -+ pStats->VsInvocations += pContext->stats[i].VsInvocations; -+ pStats->HsInvocations += pContext->stats[i].HsInvocations; -+ pStats->DsInvocations += pContext->stats[i].DsInvocations; -+ pStats->GsInvocations += pContext->stats[i].GsInvocations; -+ pStats->PsInvocations += pContext->stats[i].PsInvocations; -+ pStats->CInvocations += pContext->stats[i].CInvocations; -+ 
pStats->CsInvocations += pContext->stats[i].CsInvocations; -+ pStats->CPrimitives += pContext->stats[i].CPrimitives; -+ pStats->GsPrimitives += pContext->stats[i].GsPrimitives; -+ -+ for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) -+ { -+ pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream]; -+ -+ /// @note client is required to provide valid write offset before every draw, so we clear -+ /// out the contents of the write offset when storing stats -+ pContext->stats[i].SoWriteOffset[stream] = 0; -+ -+ pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream]; -+ pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream]; -+ } -+ } -+} -+ -+template -+void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) -+{ -+ auto lambda = [&](int comp) -+ { -+ FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); -+ pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits::GetBPC(comp) / 8); -+ }; -+ -+ const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); -+ for (uint32_t i = 0; i < numIter; ++i) -+ { -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+ } -+} -+ -+template -+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4]) -+{ -+ // convert clear color to hottile format -+ // clear color is in RGBA float/uint32 -+ simdvector vClear; -+ for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ simdscalar vComp; -+ vComp = _simd_load1_ps((const float*)&clear[comp]); -+ if (FormatTraits::isNormalized(comp)) -+ { -+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); -+ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); -+ } -+ vComp = FormatTraits::pack(comp, vComp); -+ vClear.v[FormatTraits::swizzle(comp)] = vComp; -+ } -+ -+ uint32_t tileX, tileY; -+ MacroTileMgr::getTileIndices(macroTile, tileX, tileY); -+ const API_STATE& state = GetApiState(pDC); -+ -+ int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY; -+ int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1; -+ int left = KNOB_MACROTILE_X_DIM_FIXED * tileX; -+ int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1; -+ -+ // intersect with scissor -+ top = std::max(top, state.scissorInFixedPoint.top); -+ left = std::max(left, state.scissorInFixedPoint.left); -+ bottom = std::min(bottom, state.scissorInFixedPoint.bottom); -+ right = std::min(right, state.scissorInFixedPoint.right); -+ -+ // translate to local hottile origin -+ top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; -+ bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; -+ left -= KNOB_MACROTILE_X_DIM_FIXED * tileX; -+ right -= KNOB_MACROTILE_X_DIM_FIXED * tileX; -+ -+ // convert to raster tiles -+ top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ -+ const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); -+ // compute steps between raster tile samples / raster tiles / macro tile rows -+ const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8; -+ const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * numSamples; -+ const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; -+ const uint32_t pitch = (FormatTraits::bpp * KNOB_MACROTILE_X_DIM / 8); -+ -+ HOTTILE *pHotTile = 
pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples); -+ uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits::bpp > >(pitch, left, top)) * numSamples; -+ uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits::bpp > >(pitch, x, y)) * numSamples; -+ -+ // loop over all raster tiles in the current hot tile -+ for (int y = top; y <= bottom; ++y) -+ { -+ uint8_t* pRasterTile = pRasterTileRow; -+ for (int x = left; x <= right; ++x) -+ { -+ for( int sampleNum = 0; sampleNum < numSamples; sampleNum++) -+ { -+ ClearRasterTile(pRasterTile, vClear); -+ pRasterTile += rasterTileSampleStep; -+ } -+ } -+ pRasterTileRow += macroTileRowStep; -+ } -+ -+ pHotTile->state = HOTTILE_DIRTY; -+} -+ -+ -+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -+{ -+ if (KNOB_FAST_CLEAR) -+ { -+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; -+ uint32_t numSamples = GetNumSamples(sampleCount); -+ -+ SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason. -+ -+ RDTSC_START(BEClear); -+ -+ if (pClear->flags.mask & SWR_CLEAR_COLOR) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples); -+ // All we want to do here is to mark the hot tile as being in a "needs clear" state. -+ pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); -+ pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); -+ pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); -+ pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); -+ pHotTile->state = HOTTILE_CLEAR; -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_DEPTH) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples); -+ pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; -+ pHotTile->state = HOTTILE_CLEAR; -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_STENCIL) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples); -+ -+ pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil; -+ pHotTile->state = HOTTILE_CLEAR; -+ } -+ -+ RDTSC_STOP(BEClear, 0, 0); -+ } -+ else -+ { -+ // Legacy clear -+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; -+ RDTSC_START(BEClear); -+ -+ if (pClear->flags.mask & SWR_CLEAR_COLOR) -+ { -+ /// @todo clear data should come in as RGBA32_FLOAT -+ DWORD clearData[4]; -+ float clearFloat[4]; -+ clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; -+ clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; -+ clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; -+ clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; -+ clearData[0] = *(DWORD*)&clearFloat[0]; -+ clearData[1] = *(DWORD*)&clearFloat[1]; -+ clearData[2] = *(DWORD*)&clearFloat[2]; -+ clearData[3] = *(DWORD*)&clearFloat[3]; -+ -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; -+ SWR_ASSERT(pfnClearTiles != nullptr); -+ -+ pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData); -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_DEPTH) -+ { -+ DWORD clearData[4]; -+ clearData[0] = *(DWORD*)&pClear->clearDepth; -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; -+ SWR_ASSERT(pfnClearTiles != 
nullptr); -+ -+ pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData); -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_STENCIL) -+ { -+ uint32_t value = pClear->clearStencil; -+ DWORD clearData[4]; -+ clearData[0] = *(DWORD*)&value; -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; -+ -+ pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData); -+ } -+ -+ RDTSC_STOP(BEClear, 0, 0); -+ } -+} -+ -+ -+void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ RDTSC_START(BEStoreTiles); -+ STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+#ifdef KNOB_ENABLE_RDTSC -+ uint32_t numTiles = 0; -+#endif -+ SWR_FORMAT srcFormat; -+ switch (pDesc->attachment) -+ { -+ case SWR_ATTACHMENT_COLOR0: -+ case SWR_ATTACHMENT_COLOR1: -+ case SWR_ATTACHMENT_COLOR2: -+ case SWR_ATTACHMENT_COLOR3: -+ case SWR_ATTACHMENT_COLOR4: -+ case SWR_ATTACHMENT_COLOR5: -+ case SWR_ATTACHMENT_COLOR6: -+ case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; -+ default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ } -+ -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroTile, x, y); -+ -+ // Only need to store the hottile if it's been rendered to... -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false); -+ if (pHotTile) -+ { -+ // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. -+ if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; -+ SWR_ASSERT(pfnClearTiles != nullptr); -+ -+ pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData); -+ } -+ -+ if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) -+ { -+ int destX = KNOB_MACROTILE_X_DIM * x; -+ int destY = KNOB_MACROTILE_Y_DIM * y; -+ -+ pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, -+ pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ } -+ -+ -+ if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) -+ { -+ pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; -+ } -+ } -+ RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId); -+} -+ -+ -+void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) -+ { -+ if (pDesc->attachmentMask & (1 << i)) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); -+ if (pHotTile) -+ { -+ pHotTile->state = HOTTILE_INVALID; -+ } -+ } -+ } -+} -+ -+#if KNOB_SIMD_WIDTH == 8 -+const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; -+const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; -+const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -+const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -+#define MASK 0xff -+#else -+#error Unsupported vector width -+#endif -+ -+INLINE -+bool CanEarlyZ(const SWR_PS_STATE *pPSState) -+{ -+ 
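    // Early Z is only safe when the pixel shader neither writes oDepth nor
    // reads the interpolated source depth; otherwise the depth/stencil test
    // has to run after the shader (see the late-Z path in the backends below).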
return (!pPSState->writesODepth && !pPSState->usesSourceDepth); -+} -+ -+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) -+{ -+ simdscalar vClipMask = _simd_setzero_ps(); -+ uint32_t numClipDistance = _mm_popcnt_u32(clipMask); -+ -+ for (uint32_t i = 0; i < numClipDistance; ++i) -+ { -+ // pull triangle clip distance values from clip buffer -+ simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); -+ simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); -+ simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); -+ -+ // interpolate -+ simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); -+ -+ // clip if interpolated clip distance is < 0 || NAN -+ simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); -+ -+ vClipMask = _simd_or_ps(vClipMask, vCull); -+ } -+ -+ return _simd_movemask_ps(vClipMask); -+} -+ -+template -+void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -+{ -+ RDTSC_START(BESetup); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_PS_STATE *pPSState = &state.psState; -+ const SWR_BLEND_STATE *pBlendState = &state.blendState; -+ -+ // broadcast scalars -+ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); -+ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); -+ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); -+ -+ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); -+ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); -+ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); -+ -+ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); -+ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); -+ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); -+ -+ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); -+ -+ simdscalar vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); -+ simdscalar vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); -+ simdscalar vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -+ -+ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] = renderBuffers.pColor[rt]; -+ } -+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; -+ RDTSC_STOP(BESetup, 0, 0); -+ -+ SWR_PS_CONTEXT psContext; -+ psContext.pAttribs = work.pAttribs; -+ psContext.pPerspAttribs = work.pPerspAttribs; -+ psContext.frontFace = work.triFlags.frontFacing; -+ psContext.primID = work.triFlags.primID; -+ -+ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs -+ psContext.I = work.I; -+ psContext.J = work.J; -+ psContext.recipDet = work.recipDet; -+ psContext.pSamplePos = work.pSamplePos; -+ const uint32_t numSamples = MultisampleTraits::numSamples; -+ -+ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) -+ { -+ simdscalar vYSamplePosUL; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // pixel center -+ psContext.vY = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); -+ } -+ else -+ { -+ // UL pixel corner -+ vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); -+ } -+ -+ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) -+ { -+ simdscalar vXSamplePosUL; -+ if(sampleCount > SWR_MULTISAMPLE_1X) -+ { -+ // UL pixel corner -+ vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); -+ } -+ -+ // @todo: uint32_t sampleMask = 
state.rastState.sampleMask & MultisampleTraits::sampleMask; -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ /// @todo: sampleMask / inputcoverage -+ if (work.coverageMask[sample] & MASK) -+ { -+ RDTSC_START(BEBarycentric); -+ -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // pixel center -+ psContext.vX = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); -+ } -+ else -+ { -+ // calculate per sample positions -+ psContext.vX = _simd_add_ps(vXSamplePosUL, MultisampleTraits::vX(sample)); -+ psContext.vY = _simd_add_ps(vYSamplePosUL, MultisampleTraits::vY(sample)); -+ } -+ -+ // evaluate I,J -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ simdmask coverageMask = work.coverageMask[sample] & MASK; -+ -+ // interpolate user clip distance if available -+ if (rastState.clipDistanceMask) -+ { -+ coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, -+ psContext.vI, psContext.vJ); -+ } -+ -+ simdscalar depthPassMask = vMask(coverageMask); -+ -+ uint8_t *pDepthSample, *pStencilSample; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ pDepthSample = pDepthBase; -+ pStencilSample = pStencilBase; -+ } -+ else -+ { -+ // offset depth/stencil buffers current sample -+ pDepthSample = pDepthBase + MultisampleTraits::RasterTileDepthOffset(sample); -+ pStencilSample = pStencilBase + MultisampleTraits::RasterTileStencilOffset(sample); -+ } -+ -+ // Early-Z? -+ if (CanEarlyZ(pPSState)) -+ { -+ RDTSC_START(BEEarlyDepthTest); -+ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthBase, depthPassMask, pStencilBase, pPSState->killsPixel); -+ RDTSC_STOP(BEEarlyDepthTest, 0, 0); -+ -+ if (!_simd_movemask_ps(depthPassMask)) -+ { -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ continue; -+ } -+ } -+ -+ // interpolate 1/w -+ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); -+ psContext.sampleIndex = sample; -+ psContext.mask = _simd_castps_si(depthPassMask); -+ -+ // execute pixel shader -+ RDTSC_START(BEPixelShader); -+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); -+ RDTSC_STOP(BEPixelShader, 0, 0); -+ -+ depthPassMask = _simd_castsi_ps(psContext.mask); -+ -+ //// late-Z -+ if (!CanEarlyZ(pPSState) || pPSState->killsPixel) -+ { -+ RDTSC_START(BELateDepthTest); -+ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthSample, depthPassMask, pStencilSample, false); -+ RDTSC_STOP(BELateDepthTest, 0, 0); -+ -+ if (!_simd_movemask_ps(depthPassMask)) -+ { -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ continue; -+ } -+ } -+ -+ uint32_t statMask = _simd_movemask_ps(depthPassMask); -+ uint32_t statCount = _mm_popcnt_u32(statMask); -+ UPDATE_STAT(DepthPassCount, statCount); -+ -+ simdscalari mask = _simd_castps_si(depthPassMask); -+ -+ // output merger -+ RDTSC_START(BEOutputMerger); -+ -+ if(sampleCount != SWR_MULTISAMPLE_1X) -+ { -+ if(rastState.isSampleMasked[sample]) -+ { -+ continue; -+ } -+ } -+ -+ uint32_t rasterTileColorOffset = MultisampleTraits::RasterTileColorOffset(sample); -+ for 
(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ uint8_t *pColorSample; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ pColorSample = pColorBase[rt]; -+ } -+ else -+ { -+ pColorSample = pColorBase[rt] + rasterTileColorOffset; -+ } -+ -+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; -+ -+ // Blend outputs -+ if (pRTBlend->colorBlendEnable) -+ { -+ state.pfnBlendFunc[rt](pBlendState, psContext.shaded[rt], psContext.shaded[1], pColorSample, psContext.shaded[rt]); -+ } -+ -+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. -+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); -+ -+ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); -+ -+ // store with color mask -+ if (!pRTBlend->writeDisableRed) -+ { -+ _simd_maskstore_ps((float*)pColorSample, mask, psContext.shaded[rt].x); -+ } -+ if (!pRTBlend->writeDisableGreen) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd), mask, psContext.shaded[rt].y); -+ } -+ if (!pRTBlend->writeDisableBlue) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), mask, psContext.shaded[rt].z); -+ } -+ if (!pRTBlend->writeDisableAlpha) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), mask, psContext.shaded[rt].w); -+ } -+ } -+ -+ RDTSC_STOP(BEOutputMerger, 0, 0); -+ } -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ } -+ RDTSC_START(BEEndTile); -+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ -+ for (uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ } -+ RDTSC_STOP(BEEndTile, 0, 0); -+ } -+ } -+} -+ -+template -+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -+{ -+ RDTSC_START(BESetup); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_PS_STATE *pPSState = &state.psState; -+ const SWR_BLEND_STATE *pBlendState = &state.blendState; -+ -+ // broadcast scalars -+ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); -+ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); -+ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); -+ -+ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); -+ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); -+ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); -+ -+ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); -+ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); -+ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); -+ -+ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); -+ -+ simdscalar vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); -+ simdscalar vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); -+ simdscalar vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -+ -+ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] = renderBuffers.pColor[rt]; -+ } -+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; -+ RDTSC_STOP(BESetup, 0, 0); -+ -+ SWR_PS_CONTEXT psContext; -+ psContext.pAttribs = work.pAttribs; -+ psContext.pPerspAttribs = work.pPerspAttribs; -+ psContext.frontFace = work.triFlags.frontFacing; -+ psContext.primID = work.triFlags.primID; -+ -+ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs -+ 
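    // Note: this pixel-rate path evaluates the pixel shader once per pixel and
    // broadcasts the result to every covered sample; the sample-rate path above
    // re-runs the shader for each individual sample instead.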
psContext.I = work.I; -+ psContext.J = work.J; -+ psContext.recipDet = work.recipDet; -+ psContext.pSamplePos = work.pSamplePos; -+ psContext.sampleIndex = 0; -+ -+ const uint32_t numSamples = MultisampleTraits::numSamples; -+ for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) -+ { -+ simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); -+ simdscalar vYSamplePosCenter = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); -+ for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) -+ { -+ simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); -+ simdscalar vXSamplePosCenter = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); -+ -+ // if oDepth written to, or there is a potential to discard any samples, we need to -+ // run the PS early, then interp or broadcast Z and test -+ if(pPSState->writesODepth || pPSState->killsPixel) -+ { -+ RDTSC_START(BEBarycentric); -+ // set pixel center positions -+ psContext.vX = vXSamplePosCenter; -+ psContext.vY = vYSamplePosCenter; -+ -+ // evaluate I, J at pixel center -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ // interpolate 1/w -+ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); -+ -+ /// @todo: sampleMask / inputcoverage -+ // for now just pass in all 1s -+ psContext.mask = _simd_set1_epi32(-1); -+ -+ // execute pixel shader -+ RDTSC_START(BEPixelShader); -+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); -+ RDTSC_STOP(BEPixelShader, 0, 0); -+ } -+ else -+ { -+ /// @todo: sampleMask / inputcoverage -+ // for now just through full pixel output -+ psContext.mask = _simd_set1_epi32(-1); -+ } -+ -+ simdscalar depthPassMask[numSamples]; -+ simdscalar anyDepthSamplePassed = _simd_setzero_ps(); -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ /// @todo: sampleMask / inputcoverage -+ depthPassMask[sample] = vMask(work.coverageMask[sample] & MASK); -+ // pull mask back out for any discards and and with coverage -+ depthPassMask[sample] = _simd_and_ps(depthPassMask[sample], _simd_castsi_ps(psContext.mask)); -+ -+ if (!_simd_movemask_ps(depthPassMask[sample])) -+ { -+ depthPassMask[sample] = _simd_setzero_ps(); -+ continue; -+ } -+ -+ // if oDepth isn't written to, we need to interpolate Z for each sample -+ // if clip distances are enabled, we need to interpolate for each sample -+ if(!pPSState->writesODepth || rastState.clipDistanceMask) -+ { -+ RDTSC_START(BEBarycentric); -+ // calculate per sample positions -+ simdscalar vSamplePosX = _simd_add_ps(vXSamplePosUL, MultisampleTraits::vX(sample)); -+ simdscalar vSamplePosY = _simd_add_ps(vYSamplePosUL, MultisampleTraits::vY(sample)); -+ -+ // evaluate I,J at sample positions -+ psContext.vI = vplaneps(vIa, vIb, vIc, vSamplePosX, vSamplePosY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, vSamplePosX, vSamplePosY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ if (!pPSState->writesODepth) -+ { -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ } -+ -+ // interpolate 
clip distances -+ if (rastState.clipDistanceMask) -+ { -+ uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, -+ psContext.vI, psContext.vJ); -+ depthPassMask[sample] = _simd_and_ps(depthPassMask[sample], vMask(~clipMask)); -+ } -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ } -+ // else 'broadcast' and test psContext.vZ from the PS invocation for each sample -+ -+ // offset depth/stencil buffers current sample -+ uint8_t *pDepthSample = pDepthBase + MultisampleTraits::RasterTileDepthOffset(sample); -+ uint8_t * pStencilSample = pStencilBase + MultisampleTraits::RasterTileStencilOffset(sample); -+ -+ // ZTest for this sample -+ RDTSC_START(BEEarlyDepthTest); -+ depthPassMask[sample] = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthSample, depthPassMask[sample], pStencilSample, false); -+ RDTSC_STOP(BEEarlyDepthTest, 0, 0); -+ -+ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); -+ -+ uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); -+ uint32_t statCount = _mm_popcnt_u32(statMask); -+ UPDATE_STAT(DepthPassCount, statCount); -+ } -+ -+ // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS -+ if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed)) -+ { -+ RDTSC_START(BEBarycentric); -+ // set pixel center positions -+ psContext.vX = vXSamplePosCenter; -+ psContext.vY = vYSamplePosCenter; -+ -+ // evaluate I,J at pixel center -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ // interpolate 1/w -+ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); -+ -+ // execute pixel shader -+ RDTSC_START(BEPixelShader); -+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); -+ RDTSC_STOP(BEPixelShader, 0, 0); -+ } -+ else -+ { -+ goto Endtile; -+ } -+ -+ // loop over all samples, broadcasting the results of the PS to all passing pixels -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ if(sampleCount != SWR_MULTISAMPLE_1X) -+ { -+ if(rastState.isSampleMasked[sample]) -+ continue; -+ } -+ -+ // output merger -+ RDTSC_START(BEOutputMerger); -+ // skip if none of the pixels for this sample passed -+ if(!_simd_movemask_ps(depthPassMask[sample])) -+ { -+ depthPassMask[sample] = _simd_setzero_ps(); -+ continue; -+ } -+ simdscalari mask = _simd_castps_si(depthPassMask[sample]); -+ uint32_t rasterTileColorOffset = MultisampleTraits::RasterTileColorOffset(sample); -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; -+ -+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; -+ -+ // Blend outputs -+ if(pRTBlend->colorBlendEnable) -+ { -+ state.pfnBlendFunc[rt](pBlendState, psContext.shaded[rt], psContext.shaded[1], pColorSample, psContext.shaded[rt]); -+ } -+ -+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. 
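    // Hot tile color data is stored SoA: each channel occupies a
    // KNOB_SIMD_WIDTH * sizeof(float) run, so the masked stores below step by
    // that stride and only write lanes that survived coverage, depth and discard.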
-+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); -+ -+ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); -+ -+ // store with color mask -+ if(!pRTBlend->writeDisableRed) -+ { -+ _simd_maskstore_ps((float*)pColorSample, mask, psContext.shaded[rt].x); -+ } -+ if(!pRTBlend->writeDisableGreen) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd), mask, psContext.shaded[rt].y); -+ } -+ if(!pRTBlend->writeDisableBlue) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), mask, psContext.shaded[rt].z); -+ } -+ if(!pRTBlend->writeDisableAlpha) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), mask, psContext.shaded[rt].w); -+ } -+ } -+ RDTSC_STOP(BEOutputMerger, 0, 0); -+ } -+ -+Endtile: -+ RDTSC_START(BEEndTile); -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ } -+ -+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ } -+ RDTSC_STOP(BEEndTile, 0, 0); -+ } -+ } -+} -+// optimized backend flow with NULL PS -+void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -+{ -+ RDTSC_START(BESetup); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const API_STATE& state = GetApiState(pDC); -+ // todo multisample -+ uint64_t coverageMask = work.coverageMask[0]; -+ -+ // broadcast scalars -+ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); -+ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); -+ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); -+ -+ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); -+ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); -+ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); -+ -+ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); -+ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); -+ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); -+ -+ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); -+ -+ BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; -+ -+ RDTSC_STOP(BESetup, 0, 0); -+ -+ SWR_PS_CONTEXT psContext; -+ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) -+ { -+ psContext.vY = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); -+ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) -+ { -+ if (coverageMask & MASK) -+ { -+ RDTSC_START(BEBarycentric); -+ -+ // calculate pixel positions -+ psContext.vX = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); -+ -+ // evaluate I,J -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ simdscalar depthPassMask = vMask(coverageMask & MASK); -+ RDTSC_START(BEEarlyDepthTest); -+ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthBase, depthPassMask, pStencilBase, false); -+ RDTSC_STOP(BEEarlyDepthTest, 0, 0); -+ -+ uint32_t statMask = _simd_movemask_ps(depthPassMask); -+ uint32_t statCount = _mm_popcnt_u32(statMask); -+ 
UPDATE_STAT(DepthPassCount, statCount); -+ } -+ coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ } -+ } -+} -+ -+void InitClearTilesTable() -+{ -+ memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); -+ -+ sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile; -+ sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile; -+ sClearTilesTable[R32_FLOAT] = ClearMacroTile; -+ sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile; -+ sClearTilesTable[R8_UINT] = ClearMacroTile; -+} -+ -+// initialize backend function tables -+PFN_BACKEND_FUNC gSingleSampleBackendTable[] = { -+ BackendSampleRate<0, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_1X>, -+}; -+ -+// MSAA per sample shading rate -+PFN_BACKEND_FUNC gSampleRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS] ={ -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_2X>, -+ }, -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_4X>, -+ }, -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_8X>, -+ }, -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_16X>, -+ } -+}; -+ -+// MSAA per pixel shading rate -+PFN_BACKEND_FUNC gPixelRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS] ={ -+ { -+ BackendPixelRate<0, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_2X>, -+ }, -+ { -+ BackendPixelRate<0, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_4X>, -+ }, -+ { 
-+ BackendPixelRate<0, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_8X>, -+ }, -+ { -+ BackendPixelRate<0, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_16X>, -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h -new file mode 100644 -index 0000000..218f5c0 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h -@@ -0,0 +1,45 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file backend.h -+* -+* @brief Backend handles rasterization, pixel shading and output merger -+* operations. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/os.h" -+#include "core/context.h" -+ -+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); -+void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -+void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -+void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -+void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -+void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); -+void InitClearTilesTable(); -+ -+extern PFN_BACKEND_FUNC gSingleSampleBackendTable[]; -+extern PFN_BACKEND_FUNC gSampleRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS]; -+extern PFN_BACKEND_FUNC gPixelRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS]; -diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h -new file mode 100644 -index 0000000..626c237 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/blend.h -@@ -0,0 +1,318 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file blend.cpp -+* -+* @brief Implementation for blending operations. 
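// The blending below implements, per channel and across SIMD lanes,
//     out = (src * srcFactor) OP (dst * dstFactor)
// As a concrete scalar example (illustration only, not part of the patch),
// classic back-to-front alpha blending uses srcFactor = SRC_ALPHA,
// dstFactor = INV_SRC_ALPHA and OP = ADD:
//
//     float BlendChannelAlphaExample(float srcC, float dstC, float srcA)
//     {
//         float srcFactor = srcA;          // BLENDFACTOR_SRC_ALPHA
//         float dstFactor = 1.0f - srcA;   // BLENDFACTOR_INV_SRC_ALPHA
//         return srcC * srcFactor + dstC * dstFactor;   // BLENDOP_ADD
//     }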
-+* -+******************************************************************************/ -+#include "state.h" -+ -+template -+INLINE -+void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out) -+{ -+ simdvector result; -+ -+ switch (func) -+ { -+ case BLENDFACTOR_ZERO: -+ result.x = _simd_setzero_ps(); -+ result.y = _simd_setzero_ps(); -+ result.z = _simd_setzero_ps(); -+ result.w = _simd_setzero_ps(); -+ break; -+ -+ case BLENDFACTOR_ONE: -+ result.x = _simd_set1_ps(1.0); -+ result.y = _simd_set1_ps(1.0); -+ result.z = _simd_set1_ps(1.0); -+ result.w = _simd_set1_ps(1.0); -+ break; -+ -+ case BLENDFACTOR_SRC_COLOR: -+ result = src; -+ break; -+ -+ case BLENDFACTOR_DST_COLOR: -+ result = dst; -+ break; -+ -+ case BLENDFACTOR_INV_SRC_COLOR: -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z); -+ result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w); -+ break; -+ -+ case BLENDFACTOR_INV_DST_COLOR: -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z); -+ result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); -+ break; -+ -+ case BLENDFACTOR_SRC_ALPHA: result.x = src.w; -+ result.y = src.w; -+ result.z = src.w; -+ result.w = src.w; -+ break; -+ -+ case BLENDFACTOR_INV_SRC_ALPHA: -+ { -+ simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w); -+ result.x = oneMinusSrcA; -+ result.y = oneMinusSrcA; -+ result.z = oneMinusSrcA; -+ result.w = oneMinusSrcA; -+ break; -+ } -+ -+ case BLENDFACTOR_DST_ALPHA: result.x = dst.w; -+ result.y = dst.w; -+ result.z = dst.w; -+ result.w = dst.w; -+ break; -+ -+ case BLENDFACTOR_INV_DST_ALPHA: -+ { -+ simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); -+ result.x = oneMinusDstA; -+ result.y = oneMinusDstA; -+ result.z = oneMinusDstA; -+ result.w = oneMinusDstA; -+ break; -+ } -+ -+ case BLENDFACTOR_SRC_ALPHA_SATURATE: -+ { -+ simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w)); -+ result.x = sat; -+ result.y = sat; -+ result.z = sat; -+ result.w = _simd_set1_ps(1.0); -+ break; -+ } -+ -+ case BLENDFACTOR_CONST_COLOR: -+ result.x = constantColor[0]; -+ result.y = constantColor[1]; -+ result.z = constantColor[2]; -+ result.w = constantColor[3]; -+ break; -+ -+ case BLENDFACTOR_CONST_ALPHA: -+ result.x = result.y = result.z = result.w = constantColor[3]; -+ break; -+ -+ case BLENDFACTOR_INV_CONST_COLOR: -+ { -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]); -+ result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); -+ break; -+ } -+ -+ case BLENDFACTOR_INV_CONST_ALPHA: -+ { -+ result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); -+ break; -+ } -+ -+ case BLENDFACTOR_SRC1_COLOR: -+ result.x = src1.x; -+ result.y = src1.y; -+ result.z = src1.z; -+ result.w = src1.w; -+ break; -+ -+ case BLENDFACTOR_SRC1_ALPHA: -+ result.x = result.y = result.z = result.w = src1.w; -+ break; -+ -+ case BLENDFACTOR_INV_SRC1_COLOR: -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z); -+ result.w = 
_simd_sub_ps(_simd_set1_ps(1.0f), src1.w); -+ break; -+ -+ case BLENDFACTOR_INV_SRC1_ALPHA: -+ result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); -+ break; -+ -+ default: SWR_ASSERT(false, "Unimplemented blend factor: %d", func); -+ } -+ -+ if (Color) -+ { -+ out.x = result.x; -+ out.y = result.y; -+ out.z = result.z; -+ } -+ if (Alpha) -+ { -+ out.w = result.w; -+ } -+ -+} -+ -+template -+INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out) -+{ -+ simdvector result; -+ -+ switch (blendOp) -+ { -+ case BLENDOP_ADD: -+ result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ case BLENDOP_SUBTRACT: -+ result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ case BLENDOP_REVSUBTRACT: -+ result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x)); -+ result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y)); -+ result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z)); -+ result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w)); -+ break; -+ -+ case BLENDOP_MIN: -+ result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ case BLENDOP_MAX: -+ result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ default: -+ SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp); -+ } -+ -+ if (Color) -+ { -+ out.x = result.x; -+ out.y = result.y; -+ out.z = result.z; -+ } -+ if (Alpha) -+ { -+ out.w = result.w; -+ } -+} -+ -+template -+INLINE void Clamp(simdvector &src) -+{ -+ switch (type) -+ { -+ case SWR_TYPE_FLOAT: -+ break; -+ -+ case SWR_TYPE_UNORM: -+ src.x = _simd_max_ps(src.x, _simd_setzero_ps()); -+ src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); -+ -+ src.y = _simd_max_ps(src.y, _simd_setzero_ps()); -+ src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); -+ -+ src.z = _simd_max_ps(src.z, _simd_setzero_ps()); -+ src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); -+ -+ src.w = _simd_max_ps(src.w, _simd_setzero_ps()); -+ src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); -+ break; -+ -+ case SWR_TYPE_SNORM: -+ src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f)); -+ src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); -+ -+ src.y = _simd_max_ps(src.y, 
_simd_set1_ps(-1.0f)); -+ src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); -+ -+ src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f)); -+ src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); -+ -+ src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f)); -+ src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); -+ break; -+ -+ default: -+ SWR_ASSERT(false, "Unimplemented clamp: %d", type); -+ break; -+ } -+} -+ -+template -+void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result) -+{ -+ // load render target -+ simdvector dst; -+ LoadSOA(pDst, dst); -+ -+ simdvector constColor; -+ constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]); -+ constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]); -+ constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]); -+ constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]); -+ -+ // clamp src/dst/constant -+ Clamp(src); -+ Clamp(src1); -+ Clamp(dst); -+ Clamp(constColor); -+ -+ simdvector srcFactor, dstFactor; -+ if (pBlendState->independentAlphaBlendEnable) -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor); -+ -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ BlendFunc((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+ else -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp -new file mode 100644 -index 0000000..ce27bf7 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp -@@ -0,0 +1,201 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file clip.cpp -+* -+* @brief Implementation for clipping -+* -+******************************************************************************/ -+ -+#include -+ -+#include "common/os.h" -+#include "core/clip.h" -+ -+float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) -+{ -+ return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); -+} -+ -+template -+inline void intersect( -+ int s, // index to first edge vertex v0 in pInPts. -+ int p, // index to second edge vertex v1 in pInPts. -+ const float *pInPts, // array of all the input positions. -+ const float *pInAttribs, // array of all attributes for all vertex. All the attributes for each vertex is contiguous. -+ int numInAttribs, // number of attributes per vertex. -+ int i, // output index. -+ float *pOutPts, // array of output positions. We'll write our new intersection point at i*4. -+ float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs. -+{ -+ float t; -+ -+ // Find the parameter of the intersection. -+ // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. -+ const float *v1 = &pInPts[s*4]; -+ const float *v2 = &pInPts[p*4]; -+ -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break; -+ case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break; -+ case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break; -+ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break; -+ case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break; -+ case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break; -+ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ }; -+ -+ -+ const float *a1 = &pInAttribs[s*numInAttribs]; -+ const float *a2 = &pInAttribs[p*numInAttribs]; -+ -+ float *pOutP = &pOutPts[i*4]; -+ float *pOutA = &pOutAttribs[i*numInAttribs]; -+ -+ // Interpolate new position. -+ for(int j = 0; j < 4; ++j) -+ { -+ pOutP[j] = v1[j] + (v2[j]-v1[j])*t; -+ } -+ -+ // Interpolate Attributes -+ for(int attr = 0; attr < numInAttribs; ++attr) -+ { -+ pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t; -+ } -+} -+ -+ -+// Checks whether vertex v lies inside clipping plane -+// in homogenous coords check -w < {x,y,z} < w; -+// -+template -+inline int inside(const float v[4]) -+{ -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT : return (v[0]>=-v[3]); -+ case FRUSTUM_RIGHT : return (v[0]<= v[3]); -+ case FRUSTUM_TOP : return (v[1]>=-v[3]); -+ case FRUSTUM_BOTTOM : return (v[1]<= v[3]); -+ case FRUSTUM_NEAR : return (v[2]>=0.0f); -+ case FRUSTUM_FAR : return (v[2]<= v[3]); -+ default: -+ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ return 0; -+ } -+} -+ -+ -+// Clips a polygon in homogenous coordinates to a particular clipping plane. 
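// Worked example of the interpolation factor computed by intersect() above,
// for the near plane as treated here (a vertex is inside when z >= 0, so the
// boundary coordinate is simply z):
//
//     v1.z = -1.0 (outside),  v2.z = +3.0 (inside)
//     t = ComputeInterpFactor(-1.0, 3.0) = -1 / (-1 - 3) = 0.25
//
// The new vertex therefore sits a quarter of the way from v1 to v2, where
//     z = -1 + 0.25 * (3 - (-1)) = 0   (exactly on the plane)
// and every attribute is interpolated with the same t.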
-+// Takes in vertices of the polygon (InPts) and the clipping plane -+// Puts the vertices of the clipped polygon in OutPts -+// Returns number of points in clipped polygon -+// -+template -+int ClipTriToPlane( const float *pInPts, int numInPts, -+ const float *pInAttribs, int numInAttribs, -+ float *pOutPts, float *pOutAttribs) -+{ -+ int i=0; // index number of OutPts, # of vertices in OutPts = i div 4; -+ -+ for (int j = 0; j < numInPts; ++j) -+ { -+ int s = j; -+ int p = (j + 1) % numInPts; -+ -+ int s_in = inside(&pInPts[s*4]); -+ int p_in = inside(&pInPts[p*4]); -+ -+ // test if vertex is to be added to output vertices -+ if (s_in != p_in) // edge crosses clipping plane -+ { -+ // find point of intersection -+ intersect(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); -+ i++; -+ } -+ if (p_in) // 2nd vertex is inside clipping volume, add it to output -+ { -+ // Copy 2nd vertex position of edge over to output. -+ for(int k = 0; k < 4; ++k) -+ { -+ pOutPts[i*4 + k] = pInPts[p*4 + k]; -+ } -+ // Copy 2nd vertex attributes of edge over to output. -+ for(int attr = 0; attr < numInAttribs; ++attr) -+ { -+ pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr]; -+ } -+ i++; -+ } -+ // edge does not cross clipping plane and vertex outside clipping volume -+ // => do not add vertex -+ } -+ return i; -+} -+ -+ -+ -+void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) -+{ -+ // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping -+ OSALIGN(float, 16) tempPts[6 * 4]; -+ OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; -+ -+ // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision -+ int NumOutPts = ClipTriToPlane(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); -+ NumOutPts = ClipTriToPlane(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); -+ NumOutPts = ClipTriToPlane(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); -+ NumOutPts = ClipTriToPlane(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); -+ NumOutPts = ClipTriToPlane(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); -+ NumOutPts = ClipTriToPlane(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); -+ -+ SWR_ASSERT(NumOutPts <= 6); -+ -+ *numVerts = NumOutPts; -+ return; -+} -+ -+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) -+{ -+ RDTSC_START(FEClipTriangles); -+ Clipper<3> clipper(workerId, pDC); -+ clipper.ExecuteStage(pa, prims, primMask, primId); -+ RDTSC_STOP(FEClipTriangles, 1, 0); -+} -+ -+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) -+{ -+ RDTSC_START(FEClipLines); -+ Clipper<2> clipper(workerId, pDC); -+ clipper.ExecuteStage(pa, prims, primMask, primId); -+ RDTSC_STOP(FEClipLines, 1, 0); -+} -+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) -+{ -+ RDTSC_START(FEClipPoints); -+ Clipper<1> clipper(workerId, pDC); -+ clipper.ExecuteStage(pa, prims, primMask, primId); -+ RDTSC_STOP(FEClipPoints, 1, 0); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h -new file mode 100644 -index 0000000..e9ba71d 
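// Per edge (s -> p) the scalar ClipTriToPlane() above implements the standard
// Sutherland-Hodgman cases:
//     s in,  p in   -> emit p
//     s in,  p out  -> emit the intersection point
//     s out, p in   -> emit the intersection point, then p
//     s out, p out  -> emit nothing
// A minimal caller of Clip() (buffer names are illustrative; sizes follow the
// "at most 6 output vertices" bound asserted above):
//
//     OSALIGN(float, 16) outPos[6 * 4];
//     OSALIGN(float, 16) outAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
//     int numVerts = 0;
//     Clip(triPos, triAttribs, numAttribs, outPos, &numVerts, outAttribs);
//     // numVerts <= 6; the clipped polygon is convex and is consumed as a fan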
---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h -@@ -0,0 +1,851 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file clip.h -+* -+* @brief Definitions for clipping -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/simdintrin.h" -+#include "core/context.h" -+#include "core/pa.h" -+#include "rdtsc_core.h" -+ -+enum SWR_CLIPCODES -+{ -+ // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. -+ // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. 
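// How these codes are consumed (a scalar view of what the SIMD Clipper below
// does): each vertex gets a mask of the planes it lies outside of, and the
// per-primitive decision uses the AND and the OR of the per-vertex masks
// (three for a triangle):
//
//     uint32_t isect = code[0] & code[1] & code[2];   // intersection
//     uint32_t uni   = code[0] | code[1] | code[2];   // union
//     if (isect != 0)                             { /* all verts outside one plane: trivially reject */ }
//     else if ((uni & GUARDBAND_CLIP_MASK) == 0)  { /* trivially accept, forward straight to the binner */ }
//     else                                        { /* run the guardband clipper */ }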
-+#define CLIPCODE_SHIFT 23 -+ FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), -+ FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), -+ FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), -+ FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), -+ -+ FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), -+ FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), -+ -+ NEGW = (0x40 << CLIPCODE_SHIFT), -+ -+ GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), -+ GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), -+ GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), -+ GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) -+}; -+ -+#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR) -+#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) -+ -+void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, -+ int *numVerts, float *pOutAttribs); -+ -+INLINE -+void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes) -+{ -+ clipCodes = _simd_setzero_ps(); -+ -+ // -w -+ simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); -+ -+ // FRUSTUM_LEFT -+ simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW); -+ clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); -+ -+ // FRUSTUM_TOP -+ vRes = _simd_cmplt_ps(vertex.y, vNegW); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP)))); -+ -+ // FRUSTUM_RIGHT -+ vRes = _simd_cmpgt_ps(vertex.x, vertex.w); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT)))); -+ -+ // FRUSTUM_BOTTOM -+ vRes = _simd_cmpgt_ps(vertex.y, vertex.w); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM)))); -+ -+ if (state.rastState.depthClipEnable) -+ { -+ // FRUSTUM_NEAR -+ // DX clips depth [0..w], GL clips [-w..w] -+ if (type == DX) -+ { -+ vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps()); -+ } -+ else -+ { -+ vRes = _simd_cmplt_ps(vertex.z, vNegW); -+ } -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR)))); -+ -+ // FRUSTUM_FAR -+ vRes = _simd_cmpgt_ps(vertex.z, vertex.w); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR)))); -+ } -+ -+ // NEGW -+ vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps()); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW)))); -+ -+ // GUARDBAND_LEFT -+ simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left)); -+ vRes = _simd_cmplt_ps(vertex.x, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT)))); -+ -+ // GUARDBAND_TOP -+ gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top)); -+ vRes = _simd_cmplt_ps(vertex.y, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP)))); -+ -+ // GUARDBAND_RIGHT -+ gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right)); -+ vRes = _simd_cmpgt_ps(vertex.x, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT)))); -+ -+ // GUARDBAND_BOTTOM -+ gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom)); -+ vRes = _simd_cmpgt_ps(vertex.y, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, 
_simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM)))); -+} -+ -+template -+class Clipper -+{ -+public: -+ Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : -+ workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC)) -+ { -+ static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); -+ } -+ -+ void ComputeClipCodes(simdvector vertex[]) -+ { -+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) -+ { -+ ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]); -+ } -+ } -+ -+ simdscalar ComputeClipCodeIntersection() -+ { -+ simdscalar result = this->clipCodes[0]; -+ for (uint32_t i = 1; i < NumVertsPerPrim; ++i) -+ { -+ result = _simd_and_ps(result, this->clipCodes[i]); -+ } -+ return result; -+ } -+ -+ simdscalar ComputeClipCodeUnion() -+ { -+ simdscalar result = this->clipCodes[0]; -+ for (uint32_t i = 1; i < NumVertsPerPrim; ++i) -+ { -+ result = _simd_or_ps(result, this->clipCodes[i]); -+ } -+ return result; -+ } -+ -+ int ComputeNegWMask() -+ { -+ simdscalar clipCodeUnion = ComputeClipCodeUnion(); -+ clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW))); -+ return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps())); -+ } -+ -+ int ComputeClipMask() -+ { -+ simdscalar clipUnion = ComputeClipCodeUnion(); -+ clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK))); -+ return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps())); -+ } -+ -+ // clipper is responsible for culling any prims with NAN coordinates -+ int ComputeNaNMask(simdvector prim[]) -+ { -+ simdscalar vNanMask = _simd_setzero_ps(); -+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e) -+ { -+ simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q); -+ vNanMask = _simd_or_ps(vNanMask, vNan01); -+ simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q); -+ vNanMask = _simd_or_ps(vNanMask, vNan23); -+ } -+ -+ return _simd_movemask_ps(vNanMask); -+ } -+ -+ int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[]) -+ { -+ uint8_t cullMask = this->state.rastState.cullDistanceMask; -+ simdscalar vClipCullMask = _simd_setzero_ps(); -+ DWORD index; -+ -+ simdvector vClipCullDistLo[3]; -+ simdvector vClipCullDistHi[3]; -+ -+ pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); -+ pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); -+ while (_BitScanForward(&index, cullMask)) -+ { -+ cullMask &= ~(1 << index); -+ uint32_t slot = index >> 2; -+ uint32_t component = index & 0x3; -+ -+ simdscalar vCullMaskElem = _simd_set1_ps(-1.0f); -+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e) -+ { -+ simdscalar vCullComp; -+ if (slot == 0) -+ { -+ vCullComp = vClipCullDistLo[e][component]; -+ } -+ else -+ { -+ vCullComp = vClipCullDistHi[e][component]; -+ } -+ -+ // cull if cull distance < 0 || NAN -+ simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ); -+ vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull); -+ } -+ vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem); -+ } -+ -+ // clipper should also discard any primitive with NAN clip distance -+ uint8_t clipMask = this->state.rastState.clipDistanceMask; -+ while (_BitScanForward(&index, clipMask)) -+ { -+ clipMask &= ~(1 << index); -+ uint32_t slot = index >> 2; -+ uint32_t component = index & 0x3; -+ -+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e) -+ { -+ simdscalar vClipComp; -+ if (slot == 0) -+ { -+ vClipComp = 
vClipCullDistLo[e][component]; -+ } -+ else -+ { -+ vClipComp = vClipCullDistHi[e][component]; -+ } -+ -+ simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); -+ vClipCullMask = _simd_or_ps(vClipCullMask, vClip); -+ } -+ } -+ -+ return _simd_movemask_ps(vClipCullMask); -+ } -+ -+ // clip a single primitive -+ int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) -+ { -+ OSALIGN(float, 16) inVerts[3 * 4]; -+ OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; -+ -+ // transpose primitive position -+ __m128 verts[3]; -+ pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts); -+ _mm_store_ps(&inVerts[0], verts[0]); -+ _mm_store_ps(&inVerts[4], verts[1]); -+ _mm_store_ps(&inVerts[8], verts[2]); -+ -+ // transpose attribs -+ uint32_t numScalarAttribs = this->state.linkageCount * 4; -+ -+ int idx = 0; -+ DWORD slot = 0; -+ uint32_t mapIdx = 0; -+ uint32_t tmpLinkage = uint32_t(this->state.linkageMask); -+ while (_BitScanForward(&slot, tmpLinkage)) -+ { -+ tmpLinkage &= ~(1 << slot); -+ // Compute absolute attrib slot in vertex array -+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; -+ __m128 attrib[3]; // triangle attribs (always 4 wide) -+ pa.AssembleSingle(inputSlot, primIndex, attrib); -+ _mm_store_ps(&inAttribs[idx], attrib[0]); -+ _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]); -+ _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]); -+ idx += 4; -+ } -+ -+ int numVerts; -+ Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs); -+ -+ return numVerts; -+ } -+ -+ // clip SIMD primitives -+ void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) -+ { -+ // input/output vertex store for clipper -+ simdvertex vertices[7]; // maximum 7 verts generated per triangle -+ -+ // assemble pos -+ simdvector tmpVector[NumVertsPerPrim]; -+ pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); -+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) -+ { -+ vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; -+ } -+ -+ // assemble attribs -+ DWORD slot = 0; -+ uint32_t mapIdx = 0; -+ uint32_t tmpLinkage = this->state.linkageMask; -+ while (_BitScanForward(&slot, tmpLinkage)) -+ { -+ tmpLinkage &= ~(1 << slot); -+ // Compute absolute attrib slot in vertex array -+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; -+ -+ pa.Assemble(inputSlot, tmpVector); -+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) -+ { -+ vertices[i].attrib[inputSlot] = tmpVector[i]; -+ } -+ } -+ -+ uint32_t numAttribs; -+ if (_BitScanReverse((DWORD*)&numAttribs, this->state.linkageMask)) -+ { -+ numAttribs++; -+ } -+ else -+ { -+ numAttribs = 0; -+ } -+ -+ simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); -+ -+ // set up new PA for binning clipped primitives -+ PFN_PROCESS_PRIMS pfnBinFunc = nullptr; -+ PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; -+ if (NumVertsPerPrim == 3) -+ { -+ pfnBinFunc = BinTriangles; -+ clipTopology = TOP_TRIANGLE_FAN; -+ -+ // so that the binner knows to bloat wide points later -+ if (pa.binTopology == TOP_POINT_LIST) -+ clipTopology = TOP_POINT_LIST; -+ } -+ else if (NumVertsPerPrim == 2) -+ { -+ pfnBinFunc = BinLines; -+ clipTopology = TOP_LINE_LIST; -+ } -+ else -+ { -+ SWR_ASSERT(0 && "Unexpected points in clipper."); -+ } -+ -+ -+ uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; -+ uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; -+ -+ 
const simdscalari vOffsets = _mm256_set_epi32( -+ 0 * sizeof(simdvertex), // unused lane -+ 6 * sizeof(simdvertex), -+ 5 * sizeof(simdvertex), -+ 4 * sizeof(simdvertex), -+ 3 * sizeof(simdvertex), -+ 2 * sizeof(simdvertex), -+ 1 * sizeof(simdvertex), -+ 0 * sizeof(simdvertex)); -+ -+ // only need to gather 7 verts -+ // @todo dynamic mask based on actual # of verts generated per lane -+ const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); -+ -+ uint32_t numClippedPrims = 0; -+ for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) -+ { -+ uint32_t numEmittedVerts = pVertexCount[inputPrim]; -+ if (numEmittedVerts < NumVertsPerPrim) -+ { -+ continue; -+ } -+ SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); -+ -+ uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); -+ numClippedPrims += numEmittedPrims; -+ -+ // tranpose clipper output so that each lane's vertices are in SIMD order -+ // set aside space for 2 vertices, as the PA will try to read up to 16 verts -+ // for triangle fan -+ simdvertex transposedPrims[2]; -+ -+ // transpose pos -+ uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); -+ pBase += sizeof(simdscalar); -+ } -+ -+ // transpose attribs -+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim; -+ for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); -+ pBase += sizeof(simdscalar); -+ } -+ } -+ -+ PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); -+ -+ while (clipPa.GetNextStreamOutput()) -+ { -+ do -+ { -+ simdvector attrib[NumVertsPerPrim]; -+ bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); -+ if (assemble) -+ { -+ static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; -+ pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); -+ } -+ } while (clipPa.NextPrim()); -+ } -+ } -+ -+ // update global pipeline stat -+ SWR_CONTEXT* pContext = this->pDC->pContext; -+ UPDATE_STAT(CPrimitives, numClippedPrims); -+ } -+ -+ // execute the clipper stage -+ void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId) -+ { -+ // set up binner based on PA state -+ PFN_PROCESS_PRIMS pfnBinner; -+ switch (pa.binTopology) -+ { -+ case TOP_POINT_LIST: -+ pfnBinner = CanUseSimplePoints(pDC) ? 
BinPoints : BinTriangles; -+ break; -+ case TOP_LINE_LIST: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LOOP: -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LISTSTRIP_ADJ: -+ pfnBinner = BinLines; -+ break; -+ default: -+ pfnBinner = BinTriangles; -+ break; -+ }; -+ -+ // update clipper invocations pipeline stat -+ SWR_CONTEXT* pContext = this->pDC->pContext; -+ uint32_t numInvoc = _mm_popcnt_u32(primMask); -+ UPDATE_STAT(CInvocations, numInvoc); -+ -+ ComputeClipCodes(prim); -+ -+ // cull prims with NAN coords -+ primMask &= ~ComputeNaNMask(prim); -+ -+ // user cull distance cull -+ if (this->state.rastState.cullDistanceMask) -+ { -+ primMask &= ~ComputeUserClipCullMask(pa, prim); -+ } -+ -+ // cull prims outside view frustum -+ simdscalar clipIntersection = ComputeClipCodeIntersection(); -+ int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); -+ -+ // skip clipping for points -+ uint32_t clipMask = 0; -+ if (NumVertsPerPrim != 1) -+ { -+ clipMask = primMask & ComputeClipMask(); -+ } -+ -+ if (clipMask) -+ { -+ RDTSC_START(FEGuardbandClip); -+ // we have to clip tris, execute the clipper, which will also -+ // call the binner -+ ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); -+ RDTSC_STOP(FEGuardbandClip, 1, 0); -+ } -+ else if (validMask) -+ { -+ // update CPrimitives pipeline state -+ SWR_CONTEXT* pContext = this->pDC->pContext; -+ UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask)); -+ -+ // forward valid prims directly to binner -+ pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); -+ } -+ } -+ -+private: -+ inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) -+ { -+ return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); -+ } -+ -+ inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) -+ { -+ const uint32_t simdVertexStride = sizeof(simdvertex); -+ const uint32_t componentStride = sizeof(simdscalar); -+ const uint32_t attribStride = sizeof(simdvector); -+ const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), -+ 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); -+ -+ // step to the simdvertex -+ simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); -+ -+ // step to the attribute and component -+ vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component)); -+ -+ // step to the lane -+ vOffsets = _simd_add_epi32(vOffsets, vElemOffset); -+ -+ return vOffsets; -+ } -+ -+ // gathers a single component for a given attribute for each SIMD lane -+ inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) -+ { -+ simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); -+ simdscalar vSrc = _mm256_undefined_ps(); -+ return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); -+ } -+ -+ inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) -+ { -+ simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); -+ -+ uint32_t* pOffsets = (uint32_t*)&vOffsets; -+ float* pSrc = (float*)&vSrc; -+ uint32_t mask = _simd_movemask_ps(vMask); -+ DWORD lane; -+ while (_BitScanForward(&lane, mask)) -+ { -+ mask &= ~(1 << lane); -+ uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; -+ 
*(float*)pBuf = pSrc[lane]; -+ } -+ } -+ -+ template -+ inline void intersect( -+ const simdscalar& vActiveMask, // active lanes to operate on -+ const simdscalari& s, // index to first edge vertex v0 in pInPts. -+ const simdscalari& p, // index to second edge vertex v1 in pInPts. -+ const simdvector& v1, // vertex 0 position -+ const simdvector& v2, // vertex 1 position -+ simdscalari& outIndex, // output index. -+ const float *pInVerts, // array of all the input positions. -+ uint32_t numInAttribs, // number of attributes per vertex. -+ float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. -+ { -+ // compute interpolation factor -+ simdscalar t; -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break; -+ case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break; -+ case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break; -+ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break; -+ case FRUSTUM_NEAR: -+ // DX Znear plane is 0, GL is -w -+ if (this->driverType == DX) -+ { -+ t = ComputeInterpFactor(v1[2], v2[2]); -+ } -+ else -+ { -+ t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2])); -+ } -+ break; -+ case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break; -+ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ }; -+ -+ // interpolate position and store -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]); -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); -+ } -+ -+ // interpolate attributes and store -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); -+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); -+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); -+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); -+ } -+ } -+ } -+ -+ template -+ inline simdscalar inside(const simdvector& v) -+ { -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); -+ case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]); -+ case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); -+ case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]); -+ case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? 
_simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); -+ case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]); -+ default: -+ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ return _simd_setzero_ps(); -+ } -+ } -+ -+ template -+ simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) -+ { -+ simdscalari vCurIndex = _simd_setzero_si(); -+ simdscalari vOutIndex = _simd_setzero_si(); -+ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); -+ -+ while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty -+ { -+ simdscalari s = vCurIndex; -+ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); -+ simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p); -+ p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask))); -+ -+ // gather position -+ simdvector vInPos0, vInPos1; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); -+ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); -+ } -+ -+ // compute inside mask -+ simdscalar s_in = inside(vInPos0); -+ simdscalar p_in = inside(vInPos1); -+ -+ // compute intersection mask (s_in != p_in) -+ simdscalar intersectMask = _simd_xor_ps(s_in, p_in); -+ intersectMask = _simd_and_ps(intersectMask, vActiveMask); -+ -+ // store s if inside -+ s_in = _simd_and_ps(s_in, vActiveMask); -+ if (!_simd_testz_ps(s_in, s_in)) -+ { -+ // store position -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); -+ } -+ -+ // store attribs -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); -+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); -+ } -+ } -+ -+ // increment outIndex -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); -+ } -+ -+ // compute and store intersection -+ if (!_simd_testz_ps(intersectMask, intersectMask)) -+ { -+ intersect(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); -+ -+ // increment outIndex for active lanes -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); -+ } -+ -+ // increment loop index and update active mask -+ vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1)); -+ vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); -+ } -+ -+ return vOutIndex; -+ } -+ -+ template -+ simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) -+ { -+ simdscalari vCurIndex = _simd_setzero_si(); -+ simdscalari vOutIndex = _simd_setzero_si(); -+ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); -+ -+ if (!_simd_testz_ps(vActiveMask, vActiveMask)) -+ { -+ simdscalari s = vCurIndex; -+ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); -+ -+ // gather position -+ simdvector vInPos0, vInPos1; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); -+ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); -+ } -+ -+ // compute inside mask -+ 
simdscalar s_in = inside(vInPos0); -+ simdscalar p_in = inside(vInPos1); -+ -+ // compute intersection mask (s_in != p_in) -+ simdscalar intersectMask = _simd_xor_ps(s_in, p_in); -+ intersectMask = _simd_and_ps(intersectMask, vActiveMask); -+ -+ // store s if inside -+ s_in = _simd_and_ps(s_in, vActiveMask); -+ if (!_simd_testz_ps(s_in, s_in)) -+ { -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); -+ } -+ -+ // interpolate attributes and store -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); -+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); -+ } -+ } -+ -+ // increment outIndex -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); -+ } -+ -+ // compute and store intersection -+ if (!_simd_testz_ps(intersectMask, intersectMask)) -+ { -+ intersect(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); -+ -+ // increment outIndex for active lanes -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); -+ } -+ -+ // store p if inside -+ p_in = _simd_and_ps(p_in, vActiveMask); -+ if (!_simd_testz_ps(p_in, p_in)) -+ { -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); -+ } -+ -+ // interpolate attributes and store -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); -+ ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); -+ } -+ } -+ -+ // increment outIndex -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in); -+ } -+ } -+ -+ return vOutIndex; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Vertical clipper. Clips SIMD primitives at a time -+ /// @param pVertices - pointer to vertices in SOA form. 
Clipper will read input and write results to this buffer -+ /// @param vPrimMask - mask of valid input primitives, including non-clipped prims -+ /// @param numAttribs - number of valid input attribs, including position -+ simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) -+ { -+ // temp storage -+ simdvertex tempVertices[7]; -+ float* pTempVerts = (float*)&tempVertices[0]; -+ -+ // zero out num input verts for non-active lanes -+ simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); -+ vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask); -+ -+ // clip prims to frustum -+ simdscalari vNumOutPts; -+ if (NumVertsPerPrim == 3) -+ { -+ vNumOutPts = ClipTriToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipTriToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipTriToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ } -+ else -+ { -+ SWR_ASSERT(NumVertsPerPrim == 2); -+ vNumOutPts = ClipLineToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipLineToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipLineToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ } -+ -+ // restore num verts for non-clipped, active lanes -+ simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask); -+ vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask); -+ -+ return vNumOutPts; -+ } -+ -+ const uint32_t workerId; -+ const DRIVER_TYPE driverType; -+ DRAW_CONTEXT* pDC; -+ const API_STATE& state; -+ simdscalar clipCodes[NumVertsPerPrim]; -+}; -+ -+ -+// pipeline stage functions -+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h -new file mode 100644 -index 0000000..c719f27 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/context.h -@@ -0,0 +1,444 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
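// Note on ClipPrims() above: the six fixed clip stages ping-pong between the
// caller's vertex buffer and the temporary buffer, and because the number of
// passes is even the final vertices land back in pVertices.  An equivalent
// illustrative formulation of the same ping-pong, with the per-plane template
// arguments elided exactly as in the calls above:
//
//     simdscalari vNumOutPts = vNumInPts;
//     float* bufIn  = pVertices;
//     float* bufOut = pTempVerts;
//     for (/* each of the six frustum planes, in the order used above */)
//     {
//         vNumOutPts = ClipTriToPlane<...>(bufIn, vNumOutPts, numAttribs, bufOut);
//         std::swap(bufIn, bufOut);
//     }
//     // after six swaps bufIn == pVertices again, holding the clipped result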
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file context.h -+* -+* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT -+* The SWR_CONTEXT is our global context and contains the DC ring, -+* thread state, etc. -+* -+* The DRAW_CONTEXT contains all state associated with a draw operation. -+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+ -+#include "core/api.h" -+#include "core/utils.h" -+#include "core/arena.h" -+#include "core/fifo.hpp" -+#include "core/knobs.h" -+#include "common/simdintrin.h" -+#include "core/threads.h" -+ -+// x.8 fixed point precision values -+#define FIXED_POINT_SHIFT 8 -+#define FIXED_POINT_SCALE 256 -+ -+// x.16 fixed point precision values -+#define FIXED_POINT16_SHIFT 16 -+#define FIXED_POINT16_SCALE 65536 -+ -+struct SWR_CONTEXT; -+struct DRAW_CONTEXT; -+ -+struct TRI_FLAGS -+{ -+ uint32_t frontFacing : 1; -+ uint32_t yMajor : 1; -+ uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); -+ uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); -+ uint32_t primID; -+ uint32_t renderTargetArrayIndex; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TRIANGLE_DESC -+///////////////////////////////////////////////////////////////////////// -+struct SWR_TRIANGLE_DESC -+{ -+ float I[3]; -+ float J[3]; -+ float Z[3]; -+ float OneOverW[3]; -+ float recipDet; -+ -+ float *pAttribs; -+ float *pPerspAttribs; -+ float *pSamplePos; -+ float *pUserClipBuffer; -+ -+ uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; -+ -+ TRI_FLAGS triFlags; -+}; -+ -+struct TRIANGLE_WORK_DESC -+{ -+ float *pTriBuffer; -+ float *pAttribs; -+ float *pUserClipBuffer; -+ uint32_t numAttribs; -+ TRI_FLAGS triFlags; -+}; -+ -+union CLEAR_FLAGS -+{ -+ struct -+ { -+ uint32_t mask : 3; -+ }; -+ uint32_t bits; -+}; -+ -+struct CLEAR_DESC -+{ -+ CLEAR_FLAGS flags; -+ float clearRTColor[4]; // RGBA_32F -+ float clearDepth; // [0..1] -+ BYTE clearStencil; -+}; -+ -+struct INVALIDATE_TILES_DESC -+{ -+ uint32_t attachmentMask; -+}; -+ -+struct SYNC_DESC -+{ -+ PFN_CALLBACK_FUNC pfnCallbackFunc; -+ uint64_t userData; -+ uint64_t userData2; -+}; -+ -+struct QUERY_DESC -+{ -+ SWR_STATS* pStats; -+}; -+ -+struct STORE_TILES_DESC -+{ -+ SWR_RENDERTARGET_ATTACHMENT attachment; -+ SWR_TILE_STATE postStoreTileState; -+}; -+ -+struct COMPUTE_DESC -+{ -+ uint32_t threadGroupCountX; -+ uint32_t 
threadGroupCountY; -+ uint32_t threadGroupCountZ; -+}; -+ -+typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); -+ -+enum WORK_TYPE -+{ -+ SYNC, -+ DRAW, -+ CLEAR, -+ INVALIDATETILES, -+ STORETILES, -+ QUERYSTATS, -+}; -+ -+struct BE_WORK -+{ -+ WORK_TYPE type; -+ PFN_WORK_FUNC pfnWork; -+ union -+ { -+ SYNC_DESC sync; -+ TRIANGLE_WORK_DESC tri; -+ CLEAR_DESC clear; -+ INVALIDATE_TILES_DESC invalidateTiles; -+ STORE_TILES_DESC storeTiles; -+ QUERY_DESC queryStats; -+ } desc; -+}; -+ -+struct DRAW_WORK -+{ -+ DRAW_CONTEXT* pDC; -+ union -+ { -+ uint32_t numIndices; // DrawIndexed: Number of indices for draw. -+ uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) -+ }; -+ union -+ { -+ const int32_t* pIB; // DrawIndexed: App supplied indices -+ uint32_t startVertex; // Draw: Starting vertex in VB to render from. -+ }; -+ int32_t baseVertex; -+ uint32_t numInstances; // Number of instances -+ uint32_t startInstance; // Instance offset -+ uint32_t startPrimID; // starting primitiveID for this draw batch -+ SWR_FORMAT type; // index buffer type -+}; -+ -+typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); -+struct FE_WORK -+{ -+ WORK_TYPE type; -+ PFN_FE_WORK_FUNC pfnWork; -+ union -+ { -+ SYNC_DESC sync; -+ DRAW_WORK draw; -+ CLEAR_DESC clear; -+ INVALIDATE_TILES_DESC invalidateTiles; -+ STORE_TILES_DESC storeTiles; -+ QUERY_DESC queryStats; -+ } desc; -+}; -+ -+struct GUARDBAND -+{ -+ float left, right, top, bottom; -+}; -+ -+struct PA_STATE; -+ -+// function signature for pipeline stages that execute after primitive assembly -+typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], -+ uint32_t primMask, simdscalari primID); -+ -+OSALIGNLINE(struct) API_STATE -+{ -+ // Vertex Buffers -+ SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; -+ -+ // Index Buffer -+ SWR_INDEX_BUFFER_STATE indexBuffer; -+ -+ // FS - Fetch Shader State -+ PFN_FETCH_FUNC pfnFetchFunc; -+ -+ // VS - Vertex Shader State -+ PFN_VERTEX_FUNC pfnVertexFunc; -+ -+ // GS - Geometry Shader State -+ PFN_GS_FUNC pfnGsFunc; -+ SWR_GS_STATE gsState; -+ -+ // CS - Compute Shader -+ PFN_CS_FUNC pfnCsFunc; -+ uint32_t totalThreadsInGroup; -+ -+ // FE - Frontend State -+ SWR_FRONTEND_STATE frontendState; -+ -+ // SOS - Streamout Shader State -+ PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; -+ -+ // Streamout state -+ SWR_STREAMOUT_STATE soState; -+ mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; -+ -+ // Tessellation State -+ PFN_HS_FUNC pfnHsFunc; -+ PFN_DS_FUNC pfnDsFunc; -+ SWR_TS_STATE tsState; -+ -+ // Specifies which VS outputs are sent to PS. 
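// A minimal sketch (not from the patch) of how a backend work item built from
// the BE_WORK/CLEAR_DESC structures above is filled in and later executed by a
// worker; EnqueueForMacroTile() is a hypothetical stand-in for the tile manager:
//
//     BE_WORK work;
//     work.type = CLEAR;
//     work.pfnWork = ProcessClearBE;              // declared in backend.h
//     work.desc.clear.flags.mask = 0x1;           // 3-bit attachment mask
//     work.desc.clear.clearRTColor[0] = 0.0f;     // RGBA_32F clear colour
//     EnqueueForMacroTile(macroTile, work);       // hypothetical
//
//     // worker side, for each queued item:
//     work.pfnWork(pDC, workerId, macroTile, &work.desc);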
-+ // Does not include position -+ uint32_t linkageMask; -+ uint32_t linkageCount; -+ uint8_t linkageMap[MAX_ATTRIBUTES]; -+ -+ // attrib mask, specifies the total set of attributes used -+ // by the frontend (vs, so, gs) -+ uint32_t feAttribMask; -+ -+ PRIMITIVE_TOPOLOGY topology; -+ bool forceFront; -+ -+ // RS - Rasterizer State -+ SWR_RASTSTATE rastState; -+ // floating point multisample offsets -+ float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; -+ -+ GUARDBAND gbState; -+ -+ SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; -+ SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS]; -+ -+ BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; -+ BBOX scissorInFixedPoint; -+ -+ // Backend state -+ SWR_BACKEND_STATE backendState; -+ -+ // PS - Pixel shader state -+ SWR_PS_STATE psState; -+ -+ SWR_DEPTH_STENCIL_STATE depthStencilState; -+ -+ // OM - Output Merger State -+ SWR_BLEND_STATE blendState; -+ PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; -+ -+ // Stats are incremented when this is true. -+ bool enableStats; -+}; -+ -+class MacroTileMgr; -+class DispatchQueue; -+ -+struct RenderOutputBuffers -+{ -+ uint8_t* pColor[SWR_NUM_RENDERTARGETS]; -+ uint8_t* pDepth; -+ uint8_t* pStencil; -+}; -+ -+// pipeline function pointer types -+typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); -+ -+// Draw State -+struct DRAW_STATE -+{ -+ API_STATE state; -+ -+ void* pPrivateState; // Its required the driver sets this up for each draw. -+ -+ // pipeline function pointers, filled in by API thread when setting up the draw -+ PFN_BACKEND_FUNC pfnBackend; -+ PFN_PROCESS_PRIMS pfnProcessPrims; -+ -+ Arena arena; // This should only be used by API thread. -+}; -+ -+// Draw Context -+// The api thread sets up a draw context that exists for the life of the draw. -+// This draw context maintains all of the state needed for the draw operation. -+struct DRAW_CONTEXT -+{ -+ SWR_CONTEXT *pContext; -+ -+ uint64_t drawId; -+ -+ bool isCompute; // Is this DC a compute context? -+ -+ FE_WORK FeWork; -+ volatile OSALIGNLINE(uint32_t) FeLock; -+ volatile OSALIGNLINE(bool) inUse; -+ volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? -+ -+ uint64_t dependency; -+ -+ MacroTileMgr* pTileMgr; -+ -+ // The following fields are valid if isCompute is true. -+ volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute) -+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) -+ -+ DRAW_STATE* pState; -+ Arena arena; -+}; -+ -+INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) -+{ -+ SWR_ASSERT(pDC != nullptr); -+ SWR_ASSERT(pDC->pState != nullptr); -+ -+ return pDC->pState->state; -+} -+ -+INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) -+{ -+ SWR_ASSERT(pDC != nullptr); -+ SWR_ASSERT(pDC->pState != nullptr); -+ -+ return pDC->pState->pPrivateState; -+} -+ -+class HotTileMgr; -+ -+struct SWR_CONTEXT -+{ -+ // Draw Context Ring -+ // Each draw needs its own state in order to support mulitple draws in flight across multiple threads. -+ // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number -+ // of draws that can be in flight at any given time. -+ // -+ // Description: -+ // 1. State - When an application first sets state we'll request a new draw context to use. -+ // a. If there are no available draw contexts then we'll have to wait until one becomes free. -+ // b. If one is available then set pCurDrawContext to point to it and mark it in use. 
-+ // c. All state calls set state on pCurDrawContext. -+ // 2. Draw - Creates submits a work item that is associated with current draw context. -+ // a. Set pPrevDrawContext = pCurDrawContext -+ // b. Set pCurDrawContext to NULL. -+ // 3. State - When an applications sets state after draw -+ // a. Same as step 1. -+ // b. State is copied from prev draw context to current. -+ DRAW_CONTEXT* dcRing; -+ -+ DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. -+ DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. -+ -+ // Draw State Ring -+ // When draw are very large (lots of primitives) then the API thread will break these up. -+ // These split draws all have identical state. So instead of storing the state directly -+ // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs -+ // to reference a single entry in the DS ring. -+ DRAW_STATE* dsRing; -+ -+ uint32_t curStateId; // Current index to the next available entry in the DS ring. -+ -+ uint32_t NumWorkerThreads; -+ -+ THREAD_POOL threadPool; // Thread pool associated with this context -+ -+ std::condition_variable FifosNotEmpty; -+ std::mutex WaitLock; -+ -+ // Draw Contexts will get a unique drawId generated from this -+ uint64_t nextDrawId; -+ -+ // Last retired drawId. Read/written only be API thread -+ uint64_t LastRetiredId; -+ -+ // most recent draw id enqueued by the API thread -+ // written by api thread, read by multiple workers -+ OSALIGNLINE(volatile uint64_t) DrawEnqueued; -+ -+ // Current FE status of each worker. -+ OSALIGNLINE(volatile uint64_t) WorkerFE[KNOB_MAX_NUM_THREADS]; -+ OSALIGNLINE(volatile uint64_t) WorkerBE[KNOB_MAX_NUM_THREADS]; -+ -+ DRIVER_TYPE driverType; -+ -+ uint32_t privateStateSize; -+ -+ HotTileMgr *pHotTileMgr; -+ -+ // tile load/store functions, passed in at create context time -+ PFN_LOAD_TILE pfnLoadTile; -+ PFN_STORE_TILE pfnStoreTile; -+ PFN_CLEAR_TILE pfnClearTile; -+ -+ // Global Stats -+ SWR_STATS stats[KNOB_MAX_NUM_THREADS]; -+ -+ // Scratch space for workers. -+ uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; -+}; -+ -+void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); -+void WakeAllThreads(SWR_CONTEXT *pContext); -+ -+#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += count; } -+#define SET_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = count; } -diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h -new file mode 100644 -index 0000000..9f869ec ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h -@@ -0,0 +1,215 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
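The draw-context ring described in SWR_CONTEXT above can be summarised with a minimal sketch (illustrative only, not part of the patch): RING_SIZE and RingSlotForDraw are hypothetical stand-ins for the real draws-in-flight knob and context-allocation logic, which additionally waits for the chosen slot to retire before reusing it.

#include <cstdint>

// Hypothetical sketch: a monotonically increasing drawId maps onto the
// fixed-size DC ring, so at most RING_SIZE draws can be unretired at once.
static const uint32_t RING_SIZE = 64;   // assumed draws-in-flight limit

uint32_t RingSlotForDraw(uint64_t drawId)
{
    return static_cast<uint32_t>(drawId % RING_SIZE);
}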
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file depthstencil.h -+* -+* @brief Implements depth/stencil functionality -+* -+******************************************************************************/ -+#pragma once -+#include "common/os.h" -+#include "format_conversion.h" -+ -+INLINE -+void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps) -+{ -+ simdscalari stencil = _simd_castps_si(stencilps); -+ -+ switch (op) -+ { -+ case STENCILOP_KEEP: -+ break; -+ case STENCILOP_ZERO: -+ stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask); -+ break; -+ case STENCILOP_REPLACE: -+ stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask); -+ break; -+ case STENCILOP_INCRSAT: -+ { -+ simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); -+ break; -+ } -+ case STENCILOP_DECRSAT: -+ { -+ simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); -+ break; -+ } -+ case STENCILOP_INCR: -+ { -+ simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); -+ break; -+ } -+ case STENCILOP_DECR: -+ { -+ simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); -+ break; -+ } -+ case STENCILOP_INVERT: -+ { -+ simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); -+ stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask); -+ break; -+ } -+ default: -+ break; -+ } -+} -+ -+ -+INLINE -+simdscalar ZTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, -+ bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar mask, BYTE *pStencilBase, -+ bool testOnly) -+{ -+ static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); -+ static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); -+ -+ simdscalar depthResult = _simd_set1_ps(-1.0f); -+ simdscalar zbuf; -+ -+ // clamp Z to viewport [minZ..maxZ] -+ simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); -+ simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); -+ interpZ = _simd_min_ps(vMaxZ, 
_simd_max_ps(vMinZ, interpZ)); -+ -+ if (pDSState->depthTestEnable) -+ { -+ switch (pDSState->depthTestFunc) -+ { -+ case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break; -+ case ZFUNC_ALWAYS: break; -+ default: -+ zbuf = _simd_load_ps((const float*)pDepthBase); -+ } -+ -+ switch (pDSState->depthTestFunc) -+ { -+ case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break; -+ case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break; -+ case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break; -+ case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break; -+ case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break; -+ } -+ } -+ -+ simdscalar stencilMask = _simd_set1_ps(-1.0f); -+ simdscalar stencilbuf; -+ -+ uint8_t stencilRefValue; -+ uint32_t stencilTestFunc; -+ uint32_t stencilFailOp; -+ uint32_t stencilPassDepthPassOp; -+ uint32_t stencilPassDepthFailOp; -+ uint8_t stencilTestMask; -+ uint8_t stencilWriteMask; -+ if (frontFacing || !pDSState->doubleSidedStencilTestEnable) -+ { -+ stencilRefValue = pDSState->stencilRefValue; -+ stencilTestFunc = pDSState->stencilTestFunc; -+ stencilFailOp = pDSState->stencilFailOp; -+ stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp; -+ stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp; -+ stencilTestMask = pDSState->stencilTestMask; -+ stencilWriteMask = pDSState->stencilWriteMask; -+ } -+ else -+ { -+ stencilRefValue = pDSState->backfaceStencilRefValue; -+ stencilTestFunc = pDSState->backfaceStencilTestFunc; -+ stencilFailOp = pDSState->backfaceStencilFailOp; -+ stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp; -+ stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp; -+ stencilTestMask = pDSState->backfaceStencilTestMask; -+ stencilWriteMask = pDSState->backfaceStencilWriteMask; -+ } -+ -+ if (pDSState->stencilTestEnable) -+ { -+ simdvector sbuf; -+ LoadSOA(pStencilBase, sbuf); -+ stencilbuf = sbuf.v[0]; -+ -+ // apply stencil read mask -+ simdscalar stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); -+ -+ // do stencil compare in float to avoid simd integer emulation in AVX1 -+ stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask)); -+ -+ simdscalar stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask)); -+ -+ switch (stencilTestFunc) -+ { -+ case ZFUNC_ALWAYS: break; -+ case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break; -+ case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break; -+ } -+ } -+ -+ simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask); -+ depthWriteMask = _simd_and_ps(depthWriteMask, mask); -+ -+ if (testOnly) { -+ return depthWriteMask; -+ } -+ -+ if (pDSState->depthWriteEnable) -+ { -+ _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(depthWriteMask), interpZ); -+ } -+ -+ if (pDSState->stencilWriteEnable) -+ { -+ simdscalar stencilps = stencilbuf; -+ simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); -+ -+ simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, 
mask); -+ simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthResult); -+ simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthResult, _simd_set1_ps(-1))); -+ -+ simdscalar origStencil = stencilps; -+ -+ StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); -+ StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps); -+ StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps); -+ -+ // apply stencil write mask -+ simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask); -+ stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); -+ stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); -+ -+ simdvector stencilResult; -+ stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, mask); -+ StoreSOA(stencilResult, pStencilBase); -+ } -+ -+ return depthWriteMask; -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp -new file mode 100644 -index 0000000..238f5ee ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp -@@ -0,0 +1,144 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file fifo.hpp -+* -+* @brief Definitions for our fifos used for thread communication. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/os.h" -+#include -+#include -+ -+template -+struct QUEUE -+{ -+ OSALIGNLINE(volatile uint32_t) mLock; -+ OSALIGNLINE(volatile uint32_t) mNumEntries; -+ std::vector mBlocks; -+ T* mCurBlock; -+ uint32_t mHead; -+ uint32_t mTail; uint32_t mCurBlockIdx; -+ -+ // power of 2 -+ static const uint32_t mBlockSizeShift = 6; -+ static const uint32_t mBlockSize = 1 << mBlockSizeShift; -+ -+ void initialize() -+ { -+ mLock = 0; -+ mHead = 0; -+ mTail = 0; -+ mNumEntries = 0; -+ mCurBlock = (T*)malloc(mBlockSize*sizeof(T)); -+ mBlocks.push_back(mCurBlock); -+ mCurBlockIdx = 0; -+ } -+ -+ void clear() -+ { -+ mHead = 0; -+ mTail = 0; -+ mCurBlock = mBlocks[0]; -+ mCurBlockIdx = 0; -+ -+ mNumEntries = 0; -+ _ReadWriteBarrier(); -+ mLock = 0; -+ } -+ -+ uint32_t getNumQueued() -+ { -+ return mNumEntries; -+ } -+ -+ bool tryLock() -+ { -+ if (mLock) -+ { -+ return false; -+ } -+ -+ // try to lock the FIFO -+ LONG initial = InterlockedCompareExchange(&mLock, 1, 0); -+ return (initial == 0); -+ } -+ -+ void unlock() -+ { -+ mLock = 0; -+ } -+ -+ T* peek() -+ { -+ if (mNumEntries == 0) -+ { -+ return nullptr; -+ } -+ uint32_t block = mHead >> mBlockSizeShift; -+ return &mBlocks[block][mHead & (mBlockSize-1)]; -+ } -+ -+ void dequeue_noinc() -+ { -+ mHead ++; -+ mNumEntries --; -+ } -+ -+ bool enqueue_try_nosync(const T* entry) -+ { -+ memcpy(&mCurBlock[mTail], entry, sizeof(T)); -+ -+ mTail ++; -+ if (mTail == mBlockSize) -+ { -+ if (++mCurBlockIdx < mBlocks.size()) -+ { -+ mCurBlock = mBlocks[mCurBlockIdx]; -+ } -+ else -+ { -+ T* newBlock = (T*)malloc(sizeof(T)*mBlockSize); -+ SWR_ASSERT(newBlock); -+ -+ mBlocks.push_back(newBlock); -+ mCurBlock = newBlock; -+ } -+ -+ mTail = 0; -+ } -+ -+ mNumEntries ++; -+ return true; -+ } -+ -+ void destroy() -+ { -+ for (uint32_t i = 0; i < mBlocks.size(); ++i) -+ { -+ free(mBlocks[i]); -+ } -+ } -+ -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h -new file mode 100644 -index 0000000..af57697 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h -@@ -0,0 +1,167 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
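As a usage sketch for the QUEUE fifo defined above (illustrative only, not part of the patch): WORK_ITEM and FifoExample are hypothetical names standing in for the BE_WORK entries and the producer/consumer code in the threading layer, and QUEUE is assumed to be templated on its element type.

#include <cstdint>

struct WORK_ITEM { uint32_t id; };

void FifoExample()
{
    QUEUE<WORK_ITEM> fifo;
    fifo.initialize();                   // allocates the first block

    WORK_ITEM w{ 42 };
    fifo.enqueue_try_nosync(&w);         // producer: copy the entry into the current block

    if (fifo.tryLock())                  // consumer: spin-lock guards peek/dequeue
    {
        if (WORK_ITEM* pWork = fifo.peek())
        {
            // ... consume *pWork ...
            fifo.dequeue_noinc();        // advance head past the consumed entry
        }
        fifo.unlock();
    }

    fifo.destroy();                      // frees all blocks
}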
-+* -+* @file format_conversion.h -+* -+* @brief API implementation -+* -+******************************************************************************/ -+#include "format_types.h" -+#include "format_traits.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Load SIMD packed pixels in SOA format and converts to -+/// SOA RGBA32_FLOAT format. -+/// @param pSrc - source data in SOA form -+/// @param dst - output data in SOA form -+template -+INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst) -+{ -+ // fast path for float32 -+ if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits::GetBPC(0) == 32)) -+ { -+ auto lambda = [&](int comp) -+ { -+ simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar))); -+ -+ dst.v[FormatTraits::swizzle(comp)] = vComp; -+ }; -+ -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+ return; -+ } -+ -+ auto lambda = [&](int comp) -+ { -+ // load SIMD components -+ simdscalar vComp = FormatTraits::loadSOA(comp, pSrc); -+ -+ // unpack -+ vComp = FormatTraits::unpack(comp, vComp); -+ -+ // convert -+ if (FormatTraits::isNormalized(comp)) -+ { -+ vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp)); -+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::toFloat(comp))); -+ } -+ -+ dst.v[FormatTraits::swizzle(comp)] = vComp; -+ -+ pSrc += (FormatTraits::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; -+ }; -+ -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert and store simdvector of pixels in SOA -+/// RGBA32_FLOAT to SOA format -+/// @param src - source data in SOA form -+/// @param dst - output data in SOA form -+template -+INLINE void StoreSOA(const simdvector &src, BYTE *pDst) -+{ -+ // fast path for float32 -+ if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits::GetBPC(0) == 32)) -+ { -+ for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ simdscalar vComp = src.v[FormatTraits::swizzle(comp)]; -+ -+ // Gamma-correct -+ if (FormatTraits::isSRGB) -+ { -+ if (comp < 3) // Input format is always RGBA32_FLOAT. -+ { -+ vComp = FormatTraits::convertSrgb(comp, vComp); -+ } -+ } -+ -+ _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp); -+ } -+ return; -+ } -+ -+ auto lambda = [&](int comp) -+ { -+ simdscalar vComp = src.v[FormatTraits::swizzle(comp)]; -+ -+ // Gamma-correct -+ if (FormatTraits::isSRGB) -+ { -+ if (comp < 3) // Input format is always RGBA32_FLOAT. 
-+ { -+ vComp = FormatTraits::convertSrgb(comp, vComp); -+ } -+ } -+ -+ // convert -+ if (FormatTraits::isNormalized(comp)) -+ { -+ if (FormatTraits::GetType(comp) == SWR_TYPE_UNORM) -+ { -+ vComp = _simd_max_ps(vComp, _simd_setzero_ps()); -+ } -+ -+ if (FormatTraits::GetType(comp) == SWR_TYPE_SNORM) -+ { -+ vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f)); -+ } -+ vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f)); -+ -+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); -+ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); -+ } -+ else if (FormatTraits::GetBPC(comp) < 32) -+ { -+ if (FormatTraits::GetType(comp) == SWR_TYPE_UINT) -+ { -+ int iMax = (1 << FormatTraits::GetBPC(comp)) - 1; -+ int iMin = 0; -+ simdscalari vCompi = _simd_castps_si(vComp); -+ vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin)); -+ vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax)); -+ vComp = _simd_castsi_ps(vCompi); -+ } -+ else if (FormatTraits::GetType(comp) == SWR_TYPE_SINT) -+ { -+ int iMax = (1 << (FormatTraits::GetBPC(comp) - 1)) - 1; -+ int iMin = -1 - iMax; -+ simdscalari vCompi = _simd_castps_si(vComp); -+ vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin)); -+ vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax)); -+ vComp = _simd_castsi_ps(vCompi); -+ } -+ } -+ -+ // pack -+ vComp = FormatTraits::pack(comp, vComp); -+ -+ // store -+ FormatTraits::storeSOA(comp, pDst, vComp); -+ -+ pDst += (FormatTraits::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; -+ }; -+ -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h -new file mode 100644 -index 0000000..d39f523 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h -@@ -0,0 +1,2954 @@ -+ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
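The normalized-component handling in LoadSOA/StoreSOA above reduces, per SIMD lane, to a clamp, a scale by the format's range, and an integer conversion. A scalar sketch for an 8-bit UNORM component follows (illustrative only, not part of the patch); 255.0f stands in for the format's fromFloat()/toFloat() factors, and the pack/unpack of the packed layout is omitted.

#include <algorithm>
#include <cstdint>

// Hypothetical scalar mirror of the UNORM store path: clamp to [0,1], scale to
// the integer range, convert (the SIMD code uses _simd_cvtps_epi32, round-to-nearest).
uint8_t StoreUnorm8(float v)
{
    v = std::min(std::max(v, 0.0f), 1.0f);
    return static_cast<uint8_t>(v * 255.0f + 0.5f);
}

// Reverse direction, as in LoadSOA: integer -> float, then scale back to [0,1].
float LoadUnorm8(uint8_t v)
{
    return v * (1.0f / 255.0f);
}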
-+* -+* @file format_traits.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+#include "format_types.h" -+#include "utils.h" -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatSwizzle - Component swizzle selects -+////////////////////////////////////////////////////////////////////////// -+template -+struct FormatSwizzle -+{ -+ // Return swizzle select for component. -+ INLINE static uint32_t swizzle(UINT c) -+ { -+ static const uint32_t s[4] = { comp0, comp1, comp2, comp3 }; -+ return s[c]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits -+////////////////////////////////////////////////////////////////////////// -+template -+struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0> -+{ -+ static const uint32_t bpp{ 0 }; -+ static const uint32_t numComps{ 0 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{1}; -+ static const uint32_t bcHeight{1}; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const 
uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32X32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_SINT 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static 
const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits 
specialization for R32G32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_FLOAT_X8X24_TYPELESS -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16X16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool 
isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16X16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for 
R32G32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_FLOAT_X8X24_TYPELESS_LD -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8A8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8A8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ 
false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits 
specialization for R8G8B8A8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ 
static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_UNORM_SRGB 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R11G11B10_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose11_11_10 TransposeT; -+ typedef Format3<11, 11, 10> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static 
const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R24_UNORM_X8_TYPELESS -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<24> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R24_UNORM_X8_TYPELESS_LD -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<24> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8X8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8X8_UNORM_SRGB 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8X8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8X8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R9G9B9E5_SHAREDEXP -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose9_9_9_5 TransposeT; -+ typedef Format4<9, 9, 9, 5> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10X2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false 
}; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10X2_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for 
R16G16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G6R5_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_6_5 TransposeT; -+ typedef Format3<5, 6, 5> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G6R5_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool 
isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_6_5 TransposeT; -+ typedef Format3<5, 6, 5> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5A1_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5A1_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B4G4R4A4_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose4_4_4_4 TransposeT; -+ typedef Format4<4, 4, 4, 4> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B4G4R4A4_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose4_4_4_4 TransposeT; -+ typedef Format4<4, 4, 4, 4> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_UNORM 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t 
bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ 
Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5X1_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5X1_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef 
Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static 
const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ 
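Each of the specializations above pairs a format enum with its per-component layout: upstream, the struct is declared as FormatTraits<FORMAT> and derives from ComponentTraits<type, width, ...>, listing the SWR_TYPE and bit width of every channel, although those angle-bracketed argument lists do not survive in this copy of the patch. Defaults<0, 0, 0, 0x3f800000> supplies the values substituted for missing channels; 0x3f800000 is the IEEE-754 bit pattern of 1.0f (opaque alpha), while the integer formats default the alpha slot to 0x1. What follows is a hedged, self-contained sketch of the pattern only, not the upstream header: ComponentTraits is trimmed to a single channel, the FormatTraits<R8_UNORM> fields are reconstructed from its comment and Format1<8> above, and SurfaceSizeBytes is a hypothetical consumer showing how the compile-time traits (bpp, bcWidth, bcHeight) are typically read.

    // Hedged sketch, not the upstream header: local stand-ins for the SWR
    // traits machinery, reconstructing only the R8_UNORM case shown above.
    #include <cstdint>
    #include <iostream>

    enum SWR_TYPE { SWR_TYPE_UNORM };
    enum SWR_FORMAT { R8_UNORM };

    // Trimmed stand-in: the real ComponentTraits takes a (type, width) pair
    // for each of up to four channels.
    template <SWR_TYPE Type0, uint32_t Width0>
    struct ComponentTraits
    {
        static const SWR_TYPE compType{ Type0 };
        static const uint32_t compWidth{ Width0 };
    };

    template <SWR_FORMAT format> struct FormatTraits;

    // Plausible reconstruction of the R8_UNORM specialization above; the
    // format name comes from its comment, the width from Format1<8>.
    template <> struct FormatTraits<R8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8>
    {
        static const uint32_t bpp{ 8 };
        static const uint32_t numComps{ 1 };
        static const bool isBC{ false };
        static const uint32_t bcWidth{ 1 };
        static const uint32_t bcHeight{ 1 };
    };

    // Hypothetical consumer: size of a w x h surface, rounded up to whole
    // compression blocks (1x1 for uncompressed formats like R8_UNORM).
    template <SWR_FORMAT format>
    uint64_t SurfaceSizeBytes(uint32_t w, uint32_t h)
    {
        typedef FormatTraits<format> FT;
        uint32_t blocksX = (w + FT::bcWidth - 1) / FT::bcWidth;
        uint32_t blocksY = (h + FT::bcHeight - 1) / FT::bcHeight;
        return uint64_t(blocksX) * blocksY * (FT::bpp / 8);
    }

    int main()
    {
        std::cout << "R8_UNORM 64x64: " << SurfaceSizeBytes<R8_UNORM>(64, 64)
                  << " bytes\n";   // 4096
        return 0;
    }

The block-compressed specializations below (BC1 through BC7) work the same way through bcWidth = bcHeight = 4, so the same arithmetic yields bytes per 4x4 block; for example, BC7's 128 bpp / 8 gives 16 bytes per block.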
-+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for YCRCB_SWAPUVY -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ true }; -+ static const uint32_t bcWidth{ 2 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC1_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC3_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ 
true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC4_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC5_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC1_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC2_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// 
FormatTraits - Format traits specialization for BC3_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for YCRCB_SWAPUV -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ true }; -+ static const uint32_t bcWidth{ 2 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const 
bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC4_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC5_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_UNORM 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC7_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const 
bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC7_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_SNORM 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static 
const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// 
FormatTraits - Format traits specialization for R8G8B8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h -new file mode 100644 -index 0000000..92125df ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h -@@ -0,0 +1,1053 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file formats.h -+* -+* @brief Definitions for SWR_FORMAT functions. 
-+* -+******************************************************************************/ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking same pixel sizes -+////////////////////////////////////////////////////////////////////////// -+template -+struct PackTraits -+{ -+ static const uint32_t MyNumBits = NumBits; -+ static simdscalar loadSOA(const BYTE *pSrc) = delete; -+ static void storeSOA(BYTE *pDst, simdscalar src) = delete; -+ static simdscalar unpack(simdscalar &in) = delete; -+ static simdscalar pack(simdscalar &in) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking unused channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<0, false> -+{ -+ static const uint32_t MyNumBits = 0; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); } -+ static void storeSOA(BYTE *pDst, simdscalar src) { return; } -+ static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); } -+ static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); } -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<8, false> -+{ -+ static const uint32_t MyNumBits = 8; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+ // store simd bytes -+#if KNOB_SIMD_WIDTH == 8 -+ _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepu8_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); -+ __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128()); -+ return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 8 bit signed channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<8, true> -+{ -+ static const uint32_t MyNumBits = 8; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if 
KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+ // store simd bytes -+#if KNOB_SIMD_WIDTH == 8 -+ _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ SWR_ASSERT(0); // I think this may be incorrect. -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepi8_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); -+ __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128()); -+ return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<16, false> -+{ -+ static const uint32_t MyNumBits = 16; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_load_ps((const float*)pSrc); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ // store 16B (2B * 8) -+ _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepu16_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); -+ return _mm256_castsi256_ps(res); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / 
unpacking 16 bit signed channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<16, true> -+{ -+ static const uint32_t MyNumBits = 16; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_load_ps((const float*)pSrc); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ // store 16B (2B * 8) -+ _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ SWR_ASSERT(0); // I think this is incorrectly implemented -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepi16_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); -+ return _mm256_castsi256_ps(res); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 32 bit channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<32, false> -+{ -+ static const uint32_t MyNumBits = 32; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); } -+ static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } -+ static simdscalar unpack(simdscalar &in) { return in; } -+ static simdscalar pack(simdscalar &in) { return in; } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits. 
-+//////////////////////////////////////////////////////////////////////////
-+template<SWR_TYPE type, uint32_t NumBits>
-+struct TypeTraits : PackTraits<NumBits>
-+{
-+    static const SWR_TYPE MyType = type;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for UINT8
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for SINT8
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for UINT16
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for SINT16
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for UINT32
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for SINT32
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
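For the normalized-type specializations that follow, toFloat() and fromFloat() act as the scale factors between the integer channel encoding and the float range. A minimal scalar sketch of how such factors would be applied for a UNORM8 channel is shown below; the helper names, the clamp, and the round-to-nearest step are illustrative assumptions, not code from this patch:

    // Illustrative scalar sketch (not part of the patch): applying the UNORM8
    // scale factors (1/255 and 255) from the TypeTraits specialization that follows.
    #include <algorithm>
    #include <cstdint>

    static float Unorm8ToFloat(uint8_t v)
    {
        return v * (1.0f / 255.0f);              // corresponds to toFloat() for UNORM8
    }

    static uint8_t FloatToUnorm8(float f)
    {
        f = std::min(std::max(f, 0.0f), 1.0f);   // clamp to [0, 1] (assumed)
        return (uint8_t)(f * 255.0f + 0.5f);     // corresponds to fromFloat(), rounded (assumed)
    }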
-+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM8 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<8> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 255.0f; } -+ static float fromFloat() { return 255.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM8 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<8, true> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_SNORM; -+ static float toFloat() { return 1.0f / 127.0f; } -+ static float fromFloat() { return 127.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM16 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<16> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 65535.0f; } -+ static float fromFloat() { return 65535.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for SNORM16 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<16, true> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 32767.0f; } -+ static float fromFloat() { return 32767.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM24 -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 16777215.0f; } -+ static float fromFloat() { return 16777215.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+// FLOAT Specializations from here on... -+////////////////////////////////////////////////////////////////////////// -+#define TO_M128i(a) _mm_castps_si128(a) -+#define TO_M128(a) _mm_castsi128_ps(a) -+ -+#include "math.h" -+ -+template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden > -+inline static __m128 fastpow(__m128 arg) { -+ __m128 ret = arg; -+ -+ static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) -+ * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum)); -+ -+ // Apply a constant pre-correction factor. -+ ret = _mm_mul_ps(ret, factor); -+ -+ // Reinterpret arg as integer to obtain logarithm. -+ //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret)); -+ ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); -+ -+ // Multiply logarithm by power. 
-+ ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden)); -+ -+ // Convert back to "integer" to exponentiate. -+ //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret)); -+ ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); -+ -+ return ret; -+} -+ -+inline static __m128 pow512_4(__m128 arg) { -+ // 5/12 is too small, so compute the 4th root of 20/12 instead. -+ // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. -+ // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 -+ __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg); -+ __m128 xover = _mm_mul_ps(arg, xf); -+ -+ __m128 xfm1 = _mm_rsqrt_ps(xf); -+ __m128 x2 = _mm_mul_ps(arg, arg); -+ __m128 xunder = _mm_mul_ps(x2, xfm1); -+ -+ // sqrt2 * over + 2 * sqrt2 * under -+ __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), -+ _mm_add_ps(xover, xunder)); -+ -+ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); -+ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); -+ return xavg; -+} -+ -+inline static __m128 powf_wrapper(__m128 Base, float Exp) -+{ -+ float *f = (float *)(&Base); -+ -+ return _mm_set_ps(powf(f[0], Exp), -+ powf(f[1], Exp), -+ powf(f[2], Exp), -+ powf(f[3], Exp)); -+} -+ -+static inline __m128 ConvertFloatToSRGB2(__m128& Src) -+{ -+ // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value -+ __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src)); -+ -+ // squeeze the mask down to 16 bits (4 bits per DWORD) -+ int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask); -+ -+ __m128 Result; -+ -+ // -+ if (CompareResult == 0xFFFF) -+ { -+ // all DWORDs are <= the threshold -+ Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); -+ } -+ else if (CompareResult == 0x0) -+ { -+ // all DWORDs are > the threshold -+ __m128 fSrc_0RGB = Src; -+ -+ // --> 1.055f * c(1.0f/2.4f) - 0.055f -+#if KNOB_USE_FAST_SRGB == TRUE -+ // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. -+ __m128 f = pow512_4(fSrc_0RGB); -+#else -+ __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); -+#endif -+ f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); -+ Result = _mm_sub_ps(f, _mm_set1_ps(0.055f)); -+ } -+ else -+ { -+ // some DWORDs are <= the threshold and some are > threshold -+ __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); -+ -+ __m128 fSrc_0RGB = Src; -+ -+ // --> 1.055f * c(1.0f/2.4f) - 0.055f -+#if KNOB_USE_FAST_SRGB == TRUE -+ // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 
-+ __m128 f = pow512_4(fSrc_0RGB); -+#else -+ __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); -+#endif -+ f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); -+ f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); -+ -+ // Clear the alpha (is garbage after the sub) -+ __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); -+ -+ __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); -+ __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); -+ __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); -+ -+ Result = TO_M128(CombinedParts); -+ } -+ -+ return Result; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for FLOAT16 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<16> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_FLOAT; -+ static float toFloat() { return 1.0f; } -+ static float fromFloat() { return 1.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+ -+ static simdscalar pack(const simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if (KNOB_ARCH == KNOB_ARCH_AVX) -+ // input is 8 packed float32, output is 8 packed float16 -+ simdscalari src = _simd_castps_si(in); -+ -+ static const uint32_t FLOAT_EXP_BITS = 8; -+ static const uint32_t FLOAT_MANTISSA_BITS = 23; -+ static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; -+ static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; -+ -+ static const uint32_t HALF_EXP_BITS = 5; -+ static const uint32_t HALF_MANTISSA_BITS = 10; -+ static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1; -+ static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS; -+ -+ // minimum exponent required, exponents below this are flushed to 0. 
-+ static const int32_t HALF_EXP_MIN = -14; -+ static const int32_t FLOAT_EXP_BIAS = 127; -+ static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; -+ static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand -+ -+ // maximum exponent required, exponents above this are set to infinity -+ static const int32_t HALF_EXP_MAX = 15; -+ static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; -+ -+ const simdscalari vSignMask = _simd_set1_epi32(0x80000000); -+ const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); -+ const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); -+ const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); -+ const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); -+ const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); -+ -+ simdscalari vSign = _simd_and_si(src, vSignMask); -+ simdscalari vExp = _simd_and_si(src, vExpMask); -+ simdscalari vMan = _simd_and_si(src, vManMask); -+ -+ simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); -+ simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); -+ simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); -+ simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); -+ -+ simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); -+ -+ // pack output 16-bits into the lower 16-bits of each 32-bit channel -+ simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); -+ vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); -+ -+ // Flush To Zero -+ vDst = _simd_andnot_si(vFTZMask, vDst); -+ // Apply Infinites / NaN -+ vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); -+ -+ // Apply clamps -+ vDst = _simd_andnot_si(vClampMask, vDst); -+ vDst = _simd_or_si(vDst, -+ _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); -+ -+ // Compute Denormals (subnormals) -+ if (!_mm256_testz_si256(vDenormMask, vDenormMask)) -+ { -+ uint32_t *pDenormMask = (uint32_t*)&vDenormMask; -+ uint32_t *pExp = (uint32_t*)&vExp; -+ uint32_t *pMan = (uint32_t*)&vMan; -+ uint32_t *pDst = (uint32_t*)&vDst; -+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) -+ { -+ if (pDenormMask[i]) -+ { -+ // Need to compute subnormal value -+ uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; -+ uint32_t mantissa = pMan[i] | -+ (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
Make it explicit -+ -+ pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); -+ } -+ } -+ } -+ -+ // Add in sign bits -+ vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); -+ -+ // Pack to lower 128-bits -+ vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); -+ -+#if 0 -+#if !defined(NDEBUG) -+ simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]); -+ } -+#endif -+#endif -+ -+ return _simd_castsi_ps(vDst); -+ -+#else -+ return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(const simdscalar &in) -+ { -+ // input is 8 packed float16, output is 8 packed float32 -+ SWR_ASSERT(0); // @todo -+ return _simd_setzero_ps(); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for FLOAT32 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<32> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_FLOAT; -+ static float toFloat() { return 1.0f; } -+ static float fromFloat() { return 1.0f; } -+ static inline simdscalar convertSrgb(simdscalar &in) -+ { -+#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2) -+ __m128 srcLo = _mm256_extractf128_ps(in, 0); -+ __m128 srcHi = _mm256_extractf128_ps(in, 1); -+ -+ srcLo = ConvertFloatToSRGB2(srcLo); -+ srcHi = ConvertFloatToSRGB2(srcHi); -+ -+ in = _mm256_insertf128_ps(in, srcLo, 0); -+ in = _mm256_insertf128_ps(in, srcHi, 1); -+ -+#endif -+ return in; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format1 - Bitfield for single component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+struct Format1 -+{ -+ union -+ { -+ uint32_t r : x; -+ -+ ///@ The following are here to provide full template needed in Formats. -+ uint32_t g : x; -+ uint32_t b : x; -+ uint32_t a : x; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format1 - Bitfield for single component formats - 8 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format1<8> -+{ -+ union -+ { -+ uint8_t r; -+ -+ ///@ The following are here to provide full template needed in Formats. -+ uint8_t g; -+ uint8_t b; -+ uint8_t a; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format1 - Bitfield for single component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format1<16> -+{ -+ union -+ { -+ uint16_t r; -+ -+ ///@ The following are here to provide full template needed in Formats. -+ uint16_t g; -+ uint16_t b; -+ uint16_t a; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format2 - Bitfield for 2 component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+union Format2 -+{ -+ struct -+ { -+ uint32_t r : x; -+ uint32_t g : y; -+ }; -+ struct -+ { -+ ///@ The following are here to provide full template needed in Formats. 
-+ uint32_t b : x; -+ uint32_t a : y; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format2 - Bitfield for 2 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+union Format2<8,8> -+{ -+ struct -+ { -+ uint16_t r : 8; -+ uint16_t g : 8; -+ }; -+ struct -+ { -+ ///@ The following are here to provide full template needed in Formats. -+ uint16_t b : 8; -+ uint16_t a : 8; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format3 - Bitfield for 3 component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+union Format3 -+{ -+ struct -+ { -+ uint32_t r : x; -+ uint32_t g : y; -+ uint32_t b : z; -+ }; -+ uint32_t a; ///@note This is here to provide full template needed in Formats. -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format3 - Bitfield for 3 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+union Format3<5,6,5> -+{ -+ struct -+ { -+ uint16_t r : 5; -+ uint16_t g : 6; -+ uint16_t b : 5; -+ }; -+ uint16_t a; ///@note This is here to provide full template needed in Formats. -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format4 - Bitfield for 4 component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+struct Format4 -+{ -+ uint32_t r : x; -+ uint32_t g : y; -+ uint32_t b : z; -+ uint32_t a : w; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format4 - Bitfield for 4 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format4<5,5,5,1> -+{ -+ uint16_t r : 5; -+ uint16_t g : 5; -+ uint16_t b : 5; -+ uint16_t a : 1; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format4 - Bitfield for 4 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format4<4,4,4,4> -+{ -+ uint16_t r : 4; -+ uint16_t g : 4; -+ uint16_t b : 4; -+ uint16_t a : 4; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ComponentTraits - Default components -+////////////////////////////////////////////////////////////////////////// -+template -+struct Defaults -+{ -+ INLINE static uint32_t GetDefault(uint32_t comp) -+ { -+ static const uint32_t defaults[4]{ x, y, z, w }; -+ return defaults[comp]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ComponentTraits - Component type traits. -+////////////////////////////////////////////////////////////////////////// -+template -+struct ComponentTraits -+{ -+ INLINE static SWR_TYPE GetType(uint32_t comp) -+ { -+ static const SWR_TYPE CompType[4]{ X, Y, Z, W }; -+ return CompType[comp]; -+ } -+ -+ INLINE static uint32_t GetBPC(uint32_t comp) -+ { -+ static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW }; -+ return MyBpc[comp]; -+ } -+ -+ INLINE static bool isNormalized(uint32_t comp) -+ { -+ switch (comp) -+ { -+ case 0: -+ return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false; -+ case 1: -+ return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? 
true : false; -+ case 2: -+ return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false; -+ case 3: -+ return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false; -+ } -+ SWR_ASSERT(0); -+ return false; -+ } -+ -+ INLINE static float toFloat(uint32_t comp) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::toFloat(); -+ case 1: -+ return TypeTraits::toFloat(); -+ case 2: -+ return TypeTraits::toFloat(); -+ case 3: -+ return TypeTraits::toFloat(); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::toFloat(); -+ -+ } -+ -+ INLINE static float fromFloat(uint32_t comp) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::fromFloat(); -+ case 1: -+ return TypeTraits::fromFloat(); -+ case 2: -+ return TypeTraits::fromFloat(); -+ case 3: -+ return TypeTraits::fromFloat(); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::fromFloat(); -+ } -+ -+ INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::loadSOA(pSrc); -+ case 1: -+ return TypeTraits::loadSOA(pSrc); -+ case 2: -+ return TypeTraits::loadSOA(pSrc); -+ case 3: -+ return TypeTraits::loadSOA(pSrc); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::loadSOA(pSrc); -+ } -+ -+ INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src) -+ { -+ switch (comp) -+ { -+ case 0: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ case 1: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ case 2: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ case 3: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ } -+ SWR_ASSERT(0); -+ TypeTraits::storeSOA(pDst, src); -+ } -+ -+ INLINE static simdscalar unpack(uint32_t comp, simdscalar &in) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::unpack(in); -+ case 1: -+ return TypeTraits::unpack(in); -+ case 2: -+ return TypeTraits::unpack(in); -+ case 3: -+ return TypeTraits::unpack(in); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::unpack(in); -+ } -+ -+ INLINE static simdscalar pack(uint32_t comp, simdscalar &in) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::pack(in); -+ case 1: -+ return TypeTraits::pack(in); -+ case 2: -+ return TypeTraits::pack(in); -+ case 3: -+ return TypeTraits::pack(in); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::pack(in); -+ } -+ -+ INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::convertSrgb(in);; -+ case 1: -+ return TypeTraits::convertSrgb(in);; -+ case 2: -+ return TypeTraits::convertSrgb(in);; -+ case 3: -+ return TypeTraits::convertSrgb(in);; -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::convertSrgb(in); -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp -new file mode 100644 -index 0000000..986e49f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp -@@ -0,0 +1,1972 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file frontend.cpp -+* -+* @brief Implementation for Frontend which handles vertex processing, -+* primitive assembly, clipping, binning, etc. -+* -+******************************************************************************/ -+ -+#include "api.h" -+#include "frontend.h" -+#include "backend.h" -+#include "context.h" -+#include "rdtsc_core.h" -+#include "rasterizer.h" -+#include "utils.h" -+#include "threads.h" -+#include "pa.h" -+#include "clip.h" -+#include "tilemgr.h" -+#include "tessellator.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Helper macro to generate a bitmask -+static INLINE uint32_t GenMask(uint32_t numBits) -+{ -+ SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); -+ return ((1U << numBits) - 1); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrSync. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to sync callback. -+/// @todo This should go away when we switch this to use compute threading. -+void ProcessSync( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ SYNC_DESC *pSync = (SYNC_DESC*)pUserData; -+ BE_WORK work; -+ work.type = SYNC; -+ work.pfnWork = ProcessSyncBE; -+ work.desc.sync = *pSync; -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ pTileMgr->enqueue(0, 0, &work); -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrGetStats. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to stats callback. -+/// @todo This should go away when we switch this to use compute threading. 
-+void ProcessQueryStats( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData; -+ BE_WORK work; -+ work.type = QUERYSTATS; -+ work.pfnWork = ProcessQueryStatsBE; -+ work.desc.queryStats = *pQueryStats; -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ pTileMgr->enqueue(0, 0, &work); -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrClearRenderTarget. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to clear callback. -+/// @todo This should go away when we switch this to use compute threading. -+void ProcessClear( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ // queue a clear to each macro tile -+ // compute macro tile bounds for the current scissor/viewport -+ uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED; -+ uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED; -+ uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED; -+ uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED; -+ -+ BE_WORK work; -+ work.type = CLEAR; -+ work.pfnWork = ProcessClearBE; -+ work.desc.clear = *pClear; -+ -+ for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y) -+ { -+ for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x) -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrStoreTiles. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to callback. -+/// @todo This should go away when we switch this to use compute threading. 
-+void ProcessStoreTiles( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ RDTSC_START(FEProcessStoreTiles); -+ STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData; -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ // queue a store to each macro tile -+ // compute macro tile bounds for the current render target -+ const uint32_t macroWidth = KNOB_MACROTILE_X_DIM; -+ const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; -+ -+ uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; -+ uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; -+ -+ // store tiles -+ BE_WORK work; -+ work.type = STORETILES; -+ work.pfnWork = ProcessStoreTileBE; -+ work.desc.storeTiles = *pStore; -+ -+ for (uint32_t x = 0; x < numMacroTilesX; ++x) -+ { -+ for (uint32_t y = 0; y < numMacroTilesY; ++y) -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+ -+ RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrInvalidateTiles. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to callback. -+/// @todo This should go away when we switch this to use compute threading. -+void ProcessInvalidateTiles( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ RDTSC_START(FEProcessInvalidateTiles); -+ INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData; -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ // queue a store to each macro tile -+ // compute macro tile bounds for the current render target -+ uint32_t macroWidth = KNOB_MACROTILE_X_DIM; -+ uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; -+ -+ uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; -+ uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; -+ -+ // load tiles -+ BE_WORK work; -+ work.type = INVALIDATETILES; -+ work.pfnWork = ProcessInvalidateTilesBE; -+ work.desc.invalidateTiles = *pInv; -+ -+ for (uint32_t x = 0; x < numMacroTilesX; ++x) -+ { -+ for (uint32_t y = 0; y < numMacroTilesY; ++y) -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+ -+ RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the number of primitives given the number of verts. -+/// @param mode - primitive topology for draw operation. -+/// @param numElements - number of vertices or indices for draw. -+/// @todo Frontend needs to be refactored. This will go in appropriate place then. -+uint32_t GetNumPrims( -+ PRIMITIVE_TOPOLOGY mode, -+ uint32_t numElements) -+{ -+ switch (mode) -+ { -+ case TOP_POINT_LIST: return numElements; -+ case TOP_TRIANGLE_LIST: return numElements / 3; -+ case TOP_TRIANGLE_STRIP: return numElements < 3 ? 0 : numElements - 2; -+ case TOP_TRIANGLE_FAN: return numElements < 3 ? 
0 : numElements - 2; -+ case TOP_TRIANGLE_DISC: return numElements < 2 ? 0 : numElements - 1; -+ case TOP_QUAD_LIST: return numElements / 4; -+ case TOP_QUAD_STRIP: return numElements < 4 ? 0 : (numElements - 2) / 2; -+ case TOP_LINE_STRIP: return numElements < 2 ? 0 : numElements - 1; -+ case TOP_LINE_LIST: return numElements / 2; -+ case TOP_LINE_LOOP: return numElements; -+ case TOP_RECT_LIST: return numElements / 3; -+ -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ return numElements / (mode - TOP_PATCHLIST_BASE); -+ -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LISTSTRIP_ADJ: -+ case TOP_TRI_LIST_ADJ: -+ case TOP_TRI_STRIP_ADJ: -+ case TOP_TRI_STRIP_REVERSE: -+ case TOP_POLYGON: -+ case TOP_POINT_LIST_BF: -+ case TOP_LINE_STRIP_CONT: -+ case TOP_LINE_STRIP_BF: -+ case TOP_LINE_STRIP_CONT_BF: -+ case TOP_TRIANGLE_FAN_NOSTIPPLE: -+ case TOP_PATCHLIST_BASE: -+ case TOP_UNKNOWN: -+ SWR_ASSERT(false, "Unsupported topology: %d", mode); -+ return 0; -+ } -+ -+ return 0; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Return number of verts per primitive. 
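-+/// e.g. TOP_TRIANGLE_LIST -> 3; TOP_LINE_LIST_ADJ -> 2, or 4 when includeAdjVerts is set.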
-+/// @param topology - topology -+/// @param includeAdjVerts - include adjacent verts in primitive vertices -+INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) -+{ -+ uint32_t numVerts = 0; -+ switch (topology) -+ { -+ case TOP_POINT_LIST: -+ case TOP_POINT_LIST_BF: -+ numVerts = 1; -+ break; -+ case TOP_LINE_LIST: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LINE_LOOP: -+ case TOP_LINE_STRIP_CONT: -+ case TOP_LINE_STRIP_BF: -+ case TOP_LISTSTRIP_ADJ: -+ numVerts = 2; -+ break; -+ case TOP_TRIANGLE_LIST: -+ case TOP_TRIANGLE_STRIP: -+ case TOP_TRIANGLE_FAN: -+ case TOP_TRI_LIST_ADJ: -+ case TOP_TRI_STRIP_ADJ: -+ case TOP_TRI_STRIP_REVERSE: -+ case TOP_RECT_LIST: -+ numVerts = 3; -+ break; -+ case TOP_QUAD_LIST: -+ case TOP_QUAD_STRIP: -+ numVerts = 4; -+ break; -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ numVerts = topology - TOP_PATCHLIST_BASE; -+ break; -+ default: -+ SWR_ASSERT(false, "Unsupported topology: %d", topology); -+ break; -+ } -+ -+ if (includeAdjVerts) -+ { -+ switch (topology) -+ { -+ case TOP_LISTSTRIP_ADJ: -+ case TOP_LINE_LIST_ADJ: numVerts = 4; break; -+ case TOP_TRI_STRIP_ADJ: -+ case TOP_TRI_LIST_ADJ: numVerts = 6; break; -+ default: break; -+ } -+ } -+ -+ return numVerts; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief StreamOut - Streams vertex data out to SO buffers. -+/// Generally, we are only streaming out a SIMDs worth of triangles. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris) -+static void StreamOut( -+ DRAW_CONTEXT* pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ uint32_t* pPrimData) -+{ -+ RDTSC_START(FEStreamout); -+ -+ SWR_CONTEXT* pContext = pDC->pContext; -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_STREAMOUT_STATE &soState = state.soState; -+ -+ uint32_t streamIndex = 0; ///@todo Stream index will come from PA_STATE. -+ uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); -+ -+ // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex. -+ uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t); -+ -+ SWR_STREAMOUT_CONTEXT soContext = { 0 }; -+ -+ // Setup buffer state pointers. -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ soContext.pBuffer[i] = &state.soBuffer[i]; -+ } -+ -+ uint32_t numPrims = pa.NumPrims(); -+ for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) -+ { -+ DWORD slot = 0; -+ uint32_t soMask = soState.streamMasks[streamIndex]; -+ -+ // Write all entries into primitive data buffer for SOS. 
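-+ // soMask carries one bit per VS output slot enabled for this stream; walk the set
-+ // bits, assemble each attribute for this prim, and scatter it into the sparse
-+ // pPrimData layout (4 floats per slot, primDataDwordVertexStride dwords per vertex).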
-+ while (_BitScanForward(&slot, soMask)) -+ { -+ __m128 attrib[MAX_ATTRIBUTES]; // prim attribs (always 4 wide) -+ uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT; -+ pa.AssembleSingle(paSlot, primIndex, attrib); -+ -+ // Attribute offset is relative offset from start of vertex. -+ // Note that attributes start at slot 1 in the PA buffer. We need to write this -+ // to prim data starting at slot 0. Which is why we do (slot - 1). -+ // Also note: GL works slightly differently, and needs slot 0 -+ uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t); -+ -+ // Store each vertex's attrib at appropriate locations in pPrimData buffer. -+ for (uint32_t v = 0; v < soVertsPerPrim; ++v) -+ { -+ uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); -+ -+ _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); -+ } -+ soMask &= ~(1 << slot); -+ } -+ -+ // Update pPrimData pointer -+ soContext.pPrimData = pPrimData; -+ -+ // Call SOS -+ state.pfnSoFunc[streamIndex](soContext); -+ } -+ -+ // Update SO write offset. The driver provides memory for the update. -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ if (state.soBuffer[i].pWriteOffset) -+ { -+ *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); -+ -+ // The SOS increments the existing write offset. So we don't want to increment -+ // the SoWriteOffset stat using an absolute offset instead of relative. -+ SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset); -+ } -+ } -+ -+ UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); -+ UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); -+ -+ RDTSC_STOP(FEStreamout, 1, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes number of invocations. The current index represents -+/// the start of the SIMD. The max index represents how much work -+/// items are remaining. If there is less then a SIMD's left of work -+/// then return the remaining amount of work. -+/// @param curIndex - The start index for the SIMD. -+/// @param maxIndex - The last index for all work items. -+static INLINE uint32_t GetNumInvocations( -+ uint32_t curIndex, -+ uint32_t maxIndex) -+{ -+ uint32_t remainder = (maxIndex - curIndex); -+ return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Implements GS stage. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pa - The primitive assembly object. 
-+/// @param pGsOut - output stream for GS -+template < -+ bool HasStreamOutT, -+ bool HasRastT> -+static void GeometryShaderStage( -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ PA_STATE& pa, -+ void* pGsOut, -+ void* pCutBuffer, -+ uint32_t* pSoPrimData, -+ simdscalari primID) -+{ -+ RDTSC_START(FEGeometryShader); -+ -+ SWR_GS_CONTEXT gsContext; -+ SWR_CONTEXT* pContext = pDC->pContext; -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_GS_STATE* pState = &state.gsState; -+ -+ SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); -+ SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); -+ -+ gsContext.pStream[0] = (uint8_t*)pGsOut; -+ gsContext.pCutBuffer = (uint8_t*)pCutBuffer; -+ gsContext.PrimitiveID = primID; -+ -+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); -+ simdvector attrib[MAX_ATTRIBUTES]; -+ -+ // assemble all attributes for the input primitive -+ for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; -+ pa.Assemble(attribSlot, attrib); -+ -+ for (uint32_t i = 0; i < numVertsPerPrim; ++i) -+ { -+ gsContext.vert[i].attrib[attribSlot] = attrib[i]; -+ } -+ } -+ -+ // assemble position -+ pa.Assemble(VERTEX_POSITION_SLOT, attrib); -+ for (uint32_t i = 0; i < numVertsPerPrim; ++i) -+ { -+ gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; -+ } -+ -+ const uint32_t vertexStride = sizeof(simdvertex); -+ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; -+ const uint32_t inputPrimStride = numSimdBatches * vertexStride; -+ const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH; -+ const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; -+ const uint32_t cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; -+ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) -+ { -+ gsContext.InstanceID = instance; -+ -+ // execute the geometry shader -+ state.pfnGsFunc(GetPrivateState(pDC), &gsContext); -+ -+ gsContext.pStream[0] += instanceStride; -+ gsContext.pCutBuffer += cutInstanceStride; -+ } -+ -+ // record valid prims from the frontend to avoid over binning the newly generated -+ // prims from the GS -+ uint32_t numInputPrims = pa.NumPrims(); -+ -+ // set up new binner and state for the GS output topology -+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr; -+ if (HasRastT) -+ { -+ switch (pState->outputTopology) -+ { -+ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; -+ case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; -+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; -+ default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology); -+ } -+ } -+ -+ // foreach input prim: -+ // - setup a new PA based on the emitted verts for that prim -+ // - loop over the new verts, calling PA to assemble each prim -+ uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; -+ uint32_t* pPrimitiveId = (uint32_t*)&primID; -+ -+ uint32_t totalPrimsGenerated = 0; -+ for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) -+ { -+ uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride; -+ uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride; -+ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) -+ { -+ uint32_t numEmittedVerts = pVertexCount[inputPrim]; -+ if (numEmittedVerts == 0) -+ { -+ continue; -+ } -+ -+ uint8_t* pBase = pInstanceBase + 
instance * instanceStride; -+ uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; -+ -+ DWORD numAttribs; -+ _BitScanReverse(&numAttribs, state.feAttribMask); -+ numAttribs++; -+ -+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBase, numEmittedVerts, numAttribs, pState->outputTopology, true); -+ -+ while (gsPa.GetNextStreamOutput()) -+ { -+ do -+ { -+ bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); -+ -+ if (assemble) -+ { -+ totalPrimsGenerated += gsPa.NumPrims(); -+ -+ if (HasStreamOutT) -+ { -+ StreamOut(pDC, gsPa, workerId, pSoPrimData); -+ } -+ -+ if (HasRastT) -+ { -+ simdscalari vPrimId; -+ // pull primitiveID from the GS output if available -+ if (state.gsState.emitsPrimitiveID) -+ { -+ simdvector primIdAttrib[3]; -+ gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib); -+ vPrimId = _simd_castps_si(primIdAttrib[0].x); -+ } -+ else -+ { -+ vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); -+ } -+ -+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); -+ } -+ } -+ } while (gsPa.NextPrim()); -+ } -+ } -+ } -+ -+ // update GS pipeline stats -+ UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount); -+ UPDATE_STAT(GsPrimitives, totalPrimsGenerated); -+ -+ RDTSC_STOP(FEGeometryShader, 1, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Allocate GS buffers -+/// @param pDC - pointer to draw context. -+/// @param state - API state -+/// @param ppGsOut - pointer to GS output buffer allocation -+/// @param ppCutBuffer - pointer to GS output cut buffer allocation -+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer) -+{ -+ SWR_ASSERT(state.gsState.gsEnable); -+ // allocate arena space to hold GS output verts -+ // @todo pack attribs -+ // @todo support multiple streams -+ const uint32_t vertexStride = sizeof(simdvertex); -+ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; -+ uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH; -+ *ppGsOut = pDC->arena.AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float)); -+ -+ // allocate arena space to hold cut buffer, which is essentially a bitfield sized to the -+ // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance -+ const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; -+ const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; -+ *ppCutBuffer = pDC->arena.AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate mask from remaining work. -+/// @param numWorkItems - Number of items being worked on by a SIMD. -+static INLINE simdscalari GenerateMask(uint32_t numWorkItems) -+{ -+ uint32_t numActive = (numWorkItems >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numWorkItems; -+ uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; -+ return _simd_castps_si(vMask(mask)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Contains all data generated by the HS and passed to the -+/// tessellator and DS. 
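-+/// One instance lives per worker thread (allocated lazily by AllocateTessellationData)
-+/// and persists across draws; patchData holds one SIMD's worth of HS control-point output.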
-+struct TessellationThreadLocalData -+{ -+ ScalarPatch patchData[KNOB_SIMD_WIDTH]; -+ void* pTxCtx; -+ size_t tsCtxSize; -+ -+ simdscalar* pDSOutput; -+ size_t numDSOutputVectors; -+}; -+ -+THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Allocate tessellation data for this worker thread. -+INLINE -+static void AllocateTessellationData(SWR_CONTEXT* pContext) -+{ -+ /// @TODO - Don't use thread local storage. Use Worker local storage instead. -+ if (gt_pTessellationThreadData == nullptr) -+ { -+ gt_pTessellationThreadData = (TessellationThreadLocalData*) -+ _aligned_malloc(sizeof(TessellationThreadLocalData), 64); -+ memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Implements Tessellation Stages. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pa - The primitive assembly object. -+/// @param pGsOut - output stream for GS -+template < -+ bool HasGeometryShaderT, -+ bool HasStreamOutT, -+ bool HasRastT> -+static void TessellationStages( -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ PA_STATE& pa, -+ void* pGsOut, -+ void* pCutBuffer, -+ uint32_t* pSoPrimData, -+ simdscalari primID) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_TS_STATE& tsState = state.tsState; -+ SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro -+ -+ SWR_ASSERT(gt_pTessellationThreadData); -+ -+ HANDLE tsCtx = TSInitCtx( -+ tsState.domain, -+ tsState.partitioning, -+ tsState.tsOutputTopology, -+ gt_pTessellationThreadData->pTxCtx, -+ gt_pTessellationThreadData->tsCtxSize); -+ if (tsCtx == nullptr) -+ { -+ gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64); -+ tsCtx = TSInitCtx( -+ tsState.domain, -+ tsState.partitioning, -+ tsState.tsOutputTopology, -+ gt_pTessellationThreadData->pTxCtx, -+ gt_pTessellationThreadData->tsCtxSize); -+ } -+ SWR_ASSERT(tsCtx); -+ -+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr; -+ if (HasRastT) -+ { -+ switch (tsState.postDSTopology) -+ { -+ case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; -+ case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; -+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; -+ default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology); -+ } -+ } -+ -+ SWR_HS_CONTEXT hsContext; -+ hsContext.pCPout = gt_pTessellationThreadData->patchData; -+ hsContext.PrimitiveID = primID; -+ -+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); -+ // Max storage for one attribute for an entire simdprimitive -+ simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; -+ -+ // assemble all attributes for the input primitives -+ for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; -+ pa.Assemble(attribSlot, simdattrib); -+ -+ for (uint32_t i = 0; i < numVertsPerPrim; ++i) -+ { -+ hsContext.vert[i].attrib[attribSlot] = simdattrib[i]; -+ } -+ } -+ -+#if defined(_DEBUG) -+ memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); -+#endif -+ -+ // Run the HS -+ RDTSC_START(FEHullShader); -+ state.pfnHsFunc(GetPrivateState(pDC), &hsContext); -+ RDTSC_STOP(FEHullShader, 0, 0); -+ -+ uint32_t numPrims = pa.NumPrims(); -+ UPDATE_STAT(HsInvocations, numPrims); -+ 
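-+ // For each patch in the SIMD: run the fixed-function tessellator on that patch's
-+ // tess factors, (re)size the thread-local DS output buffer for the generated domain
-+ // points, run the domain shader one SIMD of points at a time, then assemble the
-+ // tessellated prims and feed them to the GS / streamout / clip stages below.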
-+ const uint32_t* pPrimId = (const uint32_t*)&primID; -+ -+ for (uint32_t p = 0; p < numPrims; ++p) -+ { -+ // Run Tessellator -+ SWR_TS_TESSELLATED_DATA tsData = { 0 }; -+ RDTSC_START(FETessellation); -+ TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); -+ RDTSC_STOP(FETessellation, 0, 0); -+ -+ if (tsData.NumPrimitives == 0) -+ { -+ continue; -+ } -+ SWR_ASSERT(tsData.NumDomainPoints); -+ -+ // Allocate DS Output memory -+ uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; -+ size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; -+ if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) -+ { -+ _aligned_free(gt_pTessellationThreadData->pDSOutput); -+ gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(sizeof(simdvector) * requiredDSOutputVectors, 64); -+ gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; -+ } -+ SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); -+ SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors); -+ -+ // Run Domain Shader -+ SWR_DS_CONTEXT dsContext; -+ dsContext.PrimitiveID = pPrimId[p]; -+ dsContext.pCpIn = &hsContext.pCPout[p]; -+ dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; -+ dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; -+ dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; -+ dsContext.vectorStride = requiredDSVectorInvocations; -+ -+ for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) -+ { -+ RDTSC_START(FEDomainShader); -+ state.pfnDsFunc(GetPrivateState(pDC), &dsContext); -+ RDTSC_STOP(FEDomainShader, 0, 0); -+ } -+ UPDATE_STAT(DsInvocations, tsData.NumDomainPoints); -+ -+ PA_TESS tessPa( -+ pDC, -+ dsContext.pOutputData, -+ dsContext.vectorStride, -+ tsState.numDsOutputAttribs, -+ tsData.ppIndices, -+ tsData.NumPrimitives, -+ tsState.postDSTopology); -+ -+ while (tessPa.HasWork()) -+ { -+ simdvector prim[3]; // Only deal with triangles, lines, or points -+ // PaAssemble returns false if there is not enough verts to assemble. -+ RDTSC_START(FEPAAssemble); -+ bool assemble = tessPa.Assemble(VERTEX_POSITION_SLOT, prim); -+ RDTSC_STOP(FEPAAssemble, 1, 0); -+ -+ if (assemble) -+ { -+ if (HasGeometryShaderT) -+ { -+ GeometryShaderStage( -+ pDC, workerId, tessPa, pGsOut, pCutBuffer, pSoPrimData, -+ _simd_set1_epi32(dsContext.PrimitiveID)); -+ } -+ else -+ { -+ if (HasStreamOutT) -+ { -+ StreamOut(pDC, tessPa, workerId, pSoPrimData); -+ } -+ -+ if (HasRastT) -+ { -+ SWR_ASSERT(pfnClipFunc); -+ pfnClipFunc(pDC, tessPa, workerId, prim, -+ GenMask(tessPa.NumPrims()), primID); -+ } -+ } -+ } // if (assemble) -+ -+ tessPa.NextPrim(); -+ -+ } // while (tessPa.HasWork()) -+ } // for (uint32_t p = 0; p < numPrims; ++p) -+ -+ TSDestroyCtx(tsCtx); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrDraw. -+/// @tparam IsIndexedT - Is indexed drawing enabled -+/// @tparam HasTessellationT - Is tessellation enabled -+/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled -+/// @tparam HasStreamOutT - Is stream-out enabled -+/// @tparam HasRastT - Is rasterization enabled -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. 
-+/// @param pUserData - Pointer to DRAW_WORK -+template < -+ bool IsIndexedT, -+ bool HasTessellationT, -+ bool HasGeometryShaderT, -+ bool HasStreamOutT, -+ bool HasRastT> -+void ProcessDraw( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_QUEUE_FE) -+ { -+ pDC->doneFE = 1; -+ return; -+ } -+#endif -+ -+ RDTSC_START(FEProcessDraw); -+ -+ DRAW_WORK& work = *(DRAW_WORK*)pUserData; -+ const API_STATE& state = GetApiState(pDC); -+ __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ SWR_VS_CONTEXT vsContext; -+ simdvertex vin; -+ -+ int indexSize = 0; -+ int32_t endVertex = work.numVerts; -+ const int32_t* pLastRequestedIndex = nullptr; -+ if (IsIndexedT) -+ { -+ switch (work.type) -+ { -+ case R32_UINT: -+ indexSize = sizeof(uint32_t); -+ pLastRequestedIndex = &(work.pIB[endVertex]); -+ break; -+ case R16_UINT: -+ indexSize = sizeof(uint16_t); -+ // nasty address offset to last index -+ pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); -+ break; -+ case R8_UINT: -+ indexSize = sizeof(uint8_t); -+ // nasty address offset to last index -+ pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); -+ break; -+ default: -+ SWR_ASSERT(0); -+ } -+ } -+ -+ SWR_FETCH_CONTEXT fetchInfo = { 0 }; -+ fetchInfo.pStreams = &state.vertexBuffers[0]; -+ fetchInfo.StartInstance = work.startInstance; -+ fetchInfo.StartVertex = 0; -+ -+ vsContext.pVin = &vin; -+ -+ if (IsIndexedT) -+ { -+ fetchInfo.BaseVertex = work.baseVertex; -+ -+ // if the entire index buffer isn't being consumed, set the last index -+ // so that fetches < a SIMD wide will be masked off -+ fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); -+ if (pLastRequestedIndex < fetchInfo.pLastIndex) -+ { -+ fetchInfo.pLastIndex = pLastRequestedIndex; -+ } -+ } -+ else -+ { -+ fetchInfo.StartVertex = work.startVertex; -+ } -+ -+#ifdef KNOB_ENABLE_RDTSC -+ uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); -+#endif -+ -+ void* pGsOut = nullptr; -+ void* pCutBuffer = nullptr; -+ if (HasGeometryShaderT) -+ { -+ AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer); -+ } -+ -+ if (HasTessellationT) -+ { -+ SWR_ASSERT(state.tsState.tsEnable == true); -+ SWR_ASSERT(state.pfnHsFunc != nullptr); -+ SWR_ASSERT(state.pfnDsFunc != nullptr); -+ -+ AllocateTessellationData(pContext); -+ } -+ else -+ { -+ SWR_ASSERT(state.tsState.tsEnable == false); -+ SWR_ASSERT(state.pfnHsFunc == nullptr); -+ SWR_ASSERT(state.pfnDsFunc == nullptr); -+ } -+ -+ // allocate space for streamout input prim data -+ uint32_t* pSoPrimData = nullptr; -+ if (HasStreamOutT) -+ { -+ pSoPrimData = (uint32_t*)pDC->arena.AllocAligned(4096, 16); -+ } -+ -+ // choose primitive assembler -+ PA_FACTORY paFactory(pDC, IsIndexedT, state.topology, work.numVerts); -+ PA_STATE& pa = paFactory.GetPA(); -+ -+ /// @todo: temporarily move instance loop in the FE to ensure SO ordering -+ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) -+ { -+ simdscalari vIndex; -+ int32_t i = 0; -+ -+ if (IsIndexedT) -+ { -+ fetchInfo.pIndices = work.pIB; -+ } -+ else -+ { -+ vIndex = _simd_add_epi32(_simd_set1_epi32(i), vScale); -+ fetchInfo.pIndices = (const int32_t*)&vIndex; -+ } -+ -+ fetchInfo.CurInstance = instanceNum; -+ vsContext.InstanceID = instanceNum; -+ -+ while (pa.HasWork()) -+ { -+ // PaGetNextVsOutput currently has the side effect of updating some PA state machine state. 
-+ // So we need to keep this outside of (i < endVertex) check. -+ simdmask* pvCutIndices = nullptr; -+ if (IsIndexedT) -+ { -+ pvCutIndices = &pa.GetNextVsIndices(); -+ } -+ -+ simdvertex& vout = pa.GetNextVsOutput(); -+ vsContext.pVout = &vout; -+ -+ if (i < endVertex) -+ { -+ -+ // 1. Execute FS/VS for a single SIMD. -+ RDTSC_START(FEFetchShader); -+ state.pfnFetchFunc(fetchInfo, vin); -+ RDTSC_STOP(FEFetchShader, 0, 0); -+ -+ // forward fetch generated vertex IDs to the vertex shader -+ vsContext.VertexID = fetchInfo.VertexID; -+ -+ // Setup active mask for vertex shader. -+ vsContext.mask = GenerateMask(endVertex - i); -+ -+ // forward cut mask to the PA -+ if (IsIndexedT) -+ { -+ *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); -+ } -+ -+ UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex)); -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_FETCH) -+#endif -+ { -+ RDTSC_START(FEVertexShader); -+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); -+ RDTSC_STOP(FEVertexShader, 0, 0); -+ -+ UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex)); -+ } -+ } -+ -+ // 2. Assemble primitives given the last two SIMD. -+ do -+ { -+ simdvector prim[MAX_NUM_VERTS_PER_PRIM]; -+ // PaAssemble returns false if there is not enough verts to assemble. -+ RDTSC_START(FEPAAssemble); -+ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); -+ RDTSC_STOP(FEPAAssemble, 1, 0); -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_FETCH) -+#endif -+ { -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_VS) -+#endif -+ { -+ if (assemble) -+ { -+ UPDATE_STAT(IaPrimitives, pa.NumPrims()); -+ -+ if (HasTessellationT) -+ { -+ TessellationStages( -+ pDC, workerId, pa, pGsOut, pCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); -+ } -+ else if (HasGeometryShaderT) -+ { -+ GeometryShaderStage( -+ pDC, workerId, pa, pGsOut, pCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); -+ } -+ else -+ { -+ // If streamout is enabled then stream vertices out to memory. 
-+ if (HasStreamOutT) -+ { -+ StreamOut(pDC, pa, workerId, pSoPrimData); -+ } -+ -+ if (HasRastT) -+ { -+ SWR_ASSERT(pDC->pState->pfnProcessPrims); -+ pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, -+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); -+ } -+ } -+ } -+ } -+ } -+ } while (pa.NextPrim()); -+ -+ i += KNOB_SIMD_WIDTH; -+ if (IsIndexedT) -+ { -+ fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); -+ } -+ else -+ { -+ vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); -+ } -+ } -+ pa.Reset(); -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+ RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId); -+} -+// Explicit Instantiation of all combinations -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void 
ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Expland points to give them area. -+/// @param tri - SOA vertices for triangles. -+static INLINE void ExpandPoint(simdvector tri[3], simdscalar size) -+{ -+ const float bloat = 0.5f; -+ -+ const __m256 vAdjust0X = _mm256_set_ps(-bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat); -+ const __m256 vAdjust0Y = _mm256_set_ps(-bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat); -+ const __m256 vAdjust1X = _mm256_set_ps(bloat, -bloat, bloat, -bloat, bloat, -bloat, bloat, -bloat); -+ const __m256 vAdjust1Y = _mm256_set_ps(bloat, bloat, bloat, bloat, bloat, bloat, bloat, bloat); -+ const __m256 vAdjust2X = _mm256_set_ps(bloat, bloat, bloat, bloat, bloat, bloat, bloat, bloat); -+ const __m256 vAdjust2Y = _mm256_set_ps(-bloat, bloat, -bloat, bloat, -bloat, bloat, -bloat, bloat); -+ -+ tri[0].x = _simd_fmadd_ps(size, vAdjust0X, tri[0].x); -+ tri[0].y = _simd_fmadd_ps(size, vAdjust0Y, tri[0].y); -+ tri[1].x = _simd_fmadd_ps(size, vAdjust1X, tri[1].x); -+ tri[1].y = _simd_fmadd_ps(size, vAdjust1Y, tri[1].y); -+ tri[2].x = _simd_fmadd_ps(size, vAdjust2X, tri[2].x); -+ tri[2].y = _simd_fmadd_ps(size, vAdjust2Y, tri[2].y); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Processes attributes for the backend based on linkage mask and -+/// linkage map. Essentially just doing an SOA->AOS conversion and pack. -+/// @param pDC - Draw context -+/// @param pa - Primitive Assembly state -+/// @param linkageMask - Specifies which VS outputs are routed to PS. -+/// @param pLinkageMap - maps VS attribute slot to PS slot -+/// @param triIndex - Triangle to process attributes for -+/// @param pBuffer - Output result -+template -+INLINE void ProcessAttributes( -+ DRAW_CONTEXT *pDC, -+ PA_STATE&pa, -+ uint32_t linkageMask, -+ const uint8_t* pLinkageMap, -+ uint32_t triIndex, -+ float *pBuffer) -+{ -+ DWORD slot = 0; -+ uint32_t mapIdx = 0; -+ while (_BitScanForward(&slot, linkageMask)) -+ { -+ linkageMask &= ~(1 << slot); // done with this bit. -+ -+ // compute absolute slot in vertex attrib array -+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx++]; -+ -+ __m128 attrib[3]; // triangle attribs (always 4 wide) -+ pa.AssembleSingle(inputSlot, triIndex, attrib); -+ -+ for (uint32_t i = 0; i < NumVerts; ++i) -+ { -+ _mm_store_ps(pBuffer, attrib[i]); -+ pBuffer += 4; -+ } -+ -+ // pad out the attrib buffer to 3 verts to ensure the triangle -+ // interpolation code in the pixel shader works correctly for the -+ // 3 topologies - point, line, tri. This effectively zeros out the -+ // effect of the missing vertices in the triangle interpolation. 
-+ for (uint32_t i = NumVerts; i < 3; ++i) -+ { -+ _mm_store_ps(pBuffer, attrib[NumVerts - 1]); -+ pBuffer += 4; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Processes enabled user clip distances. Loads the active clip -+/// distances from the PA, sets up barycentric equations, and -+/// stores the results to the output buffer -+/// @param pa - Primitive Assembly state -+/// @param primIndex - primitive index to process -+/// @param clipDistMask - mask of enabled clip distances -+/// @param pUserClipBuffer - buffer to store results -+template -+void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer) -+{ -+ DWORD clipDist; -+ while (_BitScanForward(&clipDist, clipDistMask)) -+ { -+ clipDistMask &= ~(1 << clipDist); -+ uint32_t clipSlot = clipDist >> 2; -+ uint32_t clipComp = clipDist & 0x3; -+ uint32_t clipAttribSlot = clipSlot == 0 ? -+ VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; -+ -+ __m128 primClipDist[3]; -+ pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); -+ -+ float vertClipDist[NumVerts]; -+ for (uint32_t e = 0; e < NumVerts; ++e) -+ { -+ OSALIGNSIMD(float) aVertClipDist[4]; -+ _mm_store_ps(aVertClipDist, primClipDist[e]); -+ vertClipDist[e] = aVertClipDist[clipComp]; -+ }; -+ -+ // setup plane equations for barycentric interpolation in the backend -+ float baryCoeff[NumVerts]; -+ for (uint32_t e = 0; e < NumVerts - 1; ++e) -+ { -+ baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1]; -+ } -+ baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1]; -+ -+ for (uint32_t e = 0; e < NumVerts; ++e) -+ { -+ *(pUserClipBuffer++) = baryCoeff[e]; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping -+/// culling, viewport transform, etc. -+/// @param pDC - pointer to draw context. -+/// @param pa - The primitive assembly object. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param tri - Contains triangle position data for SIMDs worth of triangles. -+/// @param primID - Primitive ID for each triangle. 
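-+/// @param triMask - Mask of SIMD lanes containing valid triangles to bin.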
-+void BinTriangles( -+ DRAW_CONTEXT *pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ simdvector tri[3], -+ uint32_t triMask, -+ simdscalari primID) -+{ -+ RDTSC_START(FEBinTriangles); -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_FRONTEND_STATE& feState = state.frontendState; -+ const SWR_GS_STATE& gsState = state.gsState; -+ -+ // Simple wireframe mode for debugging purposes only -+ -+ simdscalar vRecipW0 = _simd_set1_ps(1.0f); -+ simdscalar vRecipW1 = _simd_set1_ps(1.0f); -+ simdscalar vRecipW2 = _simd_set1_ps(1.0f); -+ -+ if (!feState.vpTransformDisable) -+ { -+ // perspective divide -+ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); -+ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); -+ vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); -+ -+ tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); -+ tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); -+ tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); -+ -+ tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); -+ tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); -+ tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); -+ -+ tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); -+ tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); -+ tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); -+ -+ // viewport transform to screen coords -+ viewportTransform<3>(tri, state.vpMatrix[0]); -+ } -+ -+ // bloat points to tri -+ if (pa.binTopology == TOP_POINT_LIST) -+ { -+ if (rastState.pointParam) -+ { -+ simdvector size[3]; -+ pa.Assemble(rastState.pointSizeAttrib, size); -+ ExpandPoint(tri, size[0].x); -+ } -+ else -+ { -+ ExpandPoint(tri, _simd_set1_ps(rastState.pointSize)); -+ } -+ } -+ -+ // convert to fixed point -+ simdscalari vXi[3], vYi[3]; -+ vXi[0] = fpToFixedPointVertical(tri[0].x); -+ vYi[0] = fpToFixedPointVertical(tri[0].y); -+ vXi[1] = fpToFixedPointVertical(tri[1].x); -+ vYi[1] = fpToFixedPointVertical(tri[1].y); -+ vXi[2] = fpToFixedPointVertical(tri[2].x); -+ vYi[2] = fpToFixedPointVertical(tri[2].y); -+ -+ // triangle setup -+ simdscalari vAi[3], vBi[3]; -+ triangleSetupABIntVertical(vXi, vYi, vAi, vBi); -+ -+ // determinant -+ simdscalari vDet[2]; -+ calcDeterminantIntVertical(vAi, vBi, vDet); -+ -+ // cull zero area -+ int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); -+ int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); -+ -+ int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2)); -+ -+ uint32_t origTriMask = triMask; -+ triMask &= ~cullZeroAreaMask; -+ -+ // determine front winding tris -+ // CW +det -+ // CCW -det -+ maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); -+ maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); -+ int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); -+ -+ uint32_t frontWindingTris; -+ if (rastState.frontWinding == SWR_FRONTWINDING_CW) -+ { -+ frontWindingTris = cwTriMask; -+ } -+ else -+ { -+ frontWindingTris = ~cwTriMask; -+ } -+ -+ // cull -+ uint32_t cullTris; -+ switch ((SWR_CULLMODE)rastState.cullMode) -+ { -+ case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; -+ case SWR_CULLMODE_NONE: cullTris = 0x0; break; -+ case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; -+ case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; -+ default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; 
break; -+ } -+ -+ triMask &= ~cullTris; -+ -+ if (origTriMask ^ triMask) -+ { -+ RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); -+ } -+ -+ // compute per tri backface -+ uint32_t frontFaceMask = frontWindingTris; -+ -+ uint32_t *pPrimID = (uint32_t *)&primID; -+ DWORD triIndex = 0; -+ -+ if (!triMask) -+ { -+ goto endBinTriangles; -+ } -+ -+ // Calc bounding box of triangles -+ simdBBox bbox; -+ calcBoundingBoxIntVertical(vXi, vYi, bbox); -+ -+ // determine if triangle falls between pixel centers and discard -+ // only discard for non-MSAA case -+ // (left + 127) & ~255 -+ // (right + 128) & ~255 -+ -+ if(rastState.sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ origTriMask = triMask; -+ -+ int cullCenterMask; -+ { -+ simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127)); -+ left = _simd_and_si(left, _simd_set1_epi32(~255)); -+ simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128)); -+ right = _simd_and_si(right, _simd_set1_epi32(~255)); -+ -+ simdscalari vMaskH = _simd_cmpeq_epi32(left, right); -+ -+ simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127)); -+ top = _simd_and_si(top, _simd_set1_epi32(~255)); -+ simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128)); -+ bottom = _simd_and_si(bottom, _simd_set1_epi32(~255)); -+ -+ simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom); -+ vMaskV = _simd_or_si(vMaskH, vMaskV); -+ cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); -+ } -+ -+ triMask &= ~cullCenterMask; -+ -+ if(origTriMask ^ triMask) -+ { -+ RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); -+ } -+ } -+ -+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. -+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); -+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); -+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); -+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); -+ -+ // Cull tris completely outside scissor -+ { -+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); -+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); -+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); -+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); -+ triMask = triMask & ~maskOutsideScissor; -+ } -+ -+ if (!triMask) -+ { -+ goto endBinTriangles; -+ } -+ -+ // Convert triangle bbox to macrotile units. 
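-+ // The arithmetic shift removes the x.8 fixed-point fraction and divides by the macro
-+ // tile dimension in one step (KNOB_MACROTILE_*_DIM_FIXED_SHIFT presumably folds
-+ // FIXED_POINT_SHIFT into the macro tile shift), yielding inclusive macro tile indices
-+ // for the enqueue loops below.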
-+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ -+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aMTLeft, bbox.left); -+ _simd_store_si((simdscalari*)aMTRight, bbox.right); -+ _simd_store_si((simdscalari*)aMTTop, bbox.top); -+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); -+ -+ // transpose verts needed for backend -+ /// @todo modify BE to take non-transformed verts -+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; -+ vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); -+ vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); -+ vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); -+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); -+ -+ // store render target array index -+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; -+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) -+ { -+ simdvector vRtai[3]; -+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai); -+ simdscalari vRtaii; -+ vRtaii = _simd_castps_si(vRtai[0].x); -+ _simd_store_si((simdscalari*)aRTAI, vRtaii); -+ } -+ else -+ { -+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); -+ } -+ -+ // scan remaining valid triangles and bin each separately -+ while (_BitScanForward(&triIndex, triMask)) -+ { -+ uint32_t linkageCount = state.linkageCount; -+ uint32_t linkageMask = state.linkageMask; -+ uint32_t numScalarAttribs = linkageCount * 4; -+ -+ BE_WORK work; -+ work.type = DRAW; -+ -+ TRIANGLE_WORK_DESC &desc = work.desc.tri; -+ -+ desc.triFlags.frontFacing = state.forceFront ? 
1 : ((frontFaceMask >> triIndex) & 1); -+ desc.triFlags.primID = pPrimID[triIndex]; -+ desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; -+ -+ work.pfnWork = gRasterizerTable[rastState.sampleCount]; -+ -+ // store active attribs -+ float *pAttribs = (float*)pDC->arena.AllocAligned(numScalarAttribs*3*sizeof(float), 16); -+ desc.pAttribs = pAttribs; -+ desc.numAttribs = linkageCount; -+ ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs); -+ -+ // store triangle vertex data -+ desc.pTriBuffer = (float*)pDC->arena.AllocAligned(4*4*sizeof(float), 16); -+ -+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); -+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); -+ _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); -+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); -+ -+ // store user clip distances -+ if (rastState.clipDistanceMask) -+ { -+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); -+ desc.pUserClipBuffer = (float*)pDC->arena.Alloc(numClipDist * 3 * sizeof(float)); -+ ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); -+ } -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) -+ { -+ for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) -+ { -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_SETUP_TRIS) -+#endif -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ } -+ -+ triMask &= ~(1 << triIndex); -+ } -+ -+endBinTriangles: -+ RDTSC_STOP(FEBinTriangles, 1, 0); -+} -+ -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Bin SIMD points to the backend. Only supports point size of 1 -+/// @param pDC - pointer to draw context. -+/// @param pa - The primitive assembly object. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param tri - Contains point position data for SIMDs worth of points. -+/// @param primID - Primitive ID for each point. 
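-+/// @param primMask - Mask of SIMD lanes containing valid points to bin.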
-+void BinPoints( -+ DRAW_CONTEXT *pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ simdvector prim[3], -+ uint32_t primMask, -+ simdscalari primID) -+{ -+ RDTSC_START(FEBinPoints); -+ -+ simdvector& primVerts = prim[0]; -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_GS_STATE& gsState = state.gsState; -+ -+ // perspective divide -+ simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); -+ primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); -+ primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); -+ primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); -+ -+ // viewport transform to screen coords -+ viewportTransform<1>(&primVerts, state.vpMatrix[0]); -+ -+ // convert to fixed point -+ simdscalari vXi, vYi; -+ vXi = fpToFixedPointVertical(primVerts.x); -+ vYi = fpToFixedPointVertical(primVerts.y); -+ -+ // adjust for triangle rasterization rules - ie top-left rule -+ vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); -+ vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); -+ -+ // cull points off the top-left edge of the viewport -+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); -+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); -+ -+ // compute macro tile coordinates -+ simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ -+ OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aMacroX, macroX); -+ _simd_store_si((simdscalari*)aMacroY, macroY); -+ -+ // compute raster tile coordinates -+ simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ -+ // compute raster tile relative x,y for coverage mask -+ simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); -+ simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); -+ -+ simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); -+ simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); -+ -+ OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; -+ OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); -+ _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); -+ -+ OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; -+ OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); -+ _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); -+ -+ OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; -+ _simd_store_ps((float*)aZ, primVerts.z); -+ -+ // store render target array index -+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; -+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) -+ { -+ simdvector vRtai; -+ pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); -+ simdscalari vRtaii = _simd_castps_si(vRtai.x); -+ _simd_store_si((simdscalari*)aRTAI, vRtaii); -+ } -+ else -+ { -+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); -+ } -+ -+ uint32_t *pPrimID = (uint32_t *)&primID; -+ DWORD primIndex = 0; -+ // scan remaining valid triangles and bin each separately -+ while (_BitScanForward(&primIndex, primMask)) -+ { -+ uint32_t linkageCount = state.linkageCount; -+ uint32_t linkageMask = state.linkageMask; -+ -+ uint32_t numScalarAttribs = linkageCount * 4; -+ -+ BE_WORK work; -+ work.type 
= DRAW; -+ -+ TRIANGLE_WORK_DESC &desc = work.desc.tri; -+ -+ // points are always front facing -+ desc.triFlags.frontFacing = 1; -+ desc.triFlags.primID = pPrimID[primIndex]; -+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; -+ -+ work.pfnWork = rastPoint; -+ -+ // store attributes -+ float *pAttribs = (float*)pDC->arena.AllocAligned(3 * numScalarAttribs * sizeof(float), 16); -+ desc.pAttribs = pAttribs; -+ desc.numAttribs = linkageCount; -+ -+ ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs); -+ -+ // store raster tile aligned x, y, perspective correct z -+ float *pTriBuffer = (float*)pDC->arena.AllocAligned(4 * sizeof(float), 16); -+ desc.pTriBuffer = pTriBuffer; -+ *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; -+ *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; -+ *pTriBuffer = aZ[primIndex]; -+ -+ uint32_t tX = aTileRelativeX[primIndex]; -+ uint32_t tY = aTileRelativeY[primIndex]; -+ -+ // pack the relative x,y into the coverageMask, the rasterizer will -+ // generate the true coverage mask from it -+ work.desc.tri.triFlags.coverageMask = tX | (tY << 4); -+ -+ // bin it -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_SETUP_TRIS) -+#endif -+ { -+ pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); -+ } -+ primMask &= ~(1 << primIndex); -+ } -+ -+ RDTSC_STOP(FEBinPoints, 1, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Bin SIMD lines to the backend. -+/// @param pDC - pointer to draw context. -+/// @param pa - The primitive assembly object. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param tri - Contains line position data for SIMDs worth of points. -+/// @param primID - Primitive ID for each line. 
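-+/// @param primMask - Mask of SIMD lanes containing valid lines to bin.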
-+void BinLines( -+ DRAW_CONTEXT *pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ simdvector prim[], -+ uint32_t primMask, -+ simdscalari primID) -+{ -+ RDTSC_START(FEBinLines); -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_FRONTEND_STATE& feState = state.frontendState; -+ const SWR_GS_STATE& gsState = state.gsState; -+ -+ simdscalar vRecipW0 = _simd_set1_ps(1.0f); -+ simdscalar vRecipW1 = _simd_set1_ps(1.0f); -+ -+ if (!feState.vpTransformDisable) -+ { -+ // perspective divide -+ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); -+ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); -+ -+ prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); -+ prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); -+ -+ prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); -+ prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); -+ -+ prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); -+ prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); -+ -+ // viewport transform to screen coords -+ viewportTransform<2>(prim, state.vpMatrix[0]); -+ } -+ -+ // convert to fixed point -+ simdscalari vXi[2], vYi[2]; -+ vXi[0] = fpToFixedPointVertical(prim[0].x); -+ vYi[0] = fpToFixedPointVertical(prim[0].y); -+ vXi[1] = fpToFixedPointVertical(prim[1].x); -+ vYi[1] = fpToFixedPointVertical(prim[1].y); -+ -+ // compute x-major vs y-major mask -+ simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); -+ simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); -+ simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); -+ uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); -+ -+ // cull zero-length lines -+ simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); -+ vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); -+ -+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); -+ -+ uint32_t *pPrimID = (uint32_t *)&primID; -+ -+ simdscalar vUnused = _simd_setzero_ps(); -+ -+ // Calc bounding box of lines -+ simdBBox bbox; -+ bbox.left = _simd_min_epi32(vXi[0], vXi[1]); -+ bbox.right = _simd_max_epi32(vXi[0], vXi[1]); -+ bbox.top = _simd_min_epi32(vYi[0], vYi[1]); -+ bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]); -+ -+ // bloat bbox by line width along minor axis -+ simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); -+ simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); -+ simdBBox bloatBox; -+ bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); -+ bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi); -+ bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); -+ bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); -+ -+ bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask); -+ bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask); -+ bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask); -+ bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask); -+ -+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. 
-+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); -+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); -+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); -+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); -+ -+ // Cull prims completely outside scissor -+ { -+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); -+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); -+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); -+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); -+ primMask = primMask & ~maskOutsideScissor; -+ } -+ -+ if (!primMask) -+ { -+ goto endBinLines; -+ } -+ -+ // Convert triangle bbox to macrotile units. -+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ -+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aMTLeft, bbox.left); -+ _simd_store_si((simdscalari*)aMTRight, bbox.right); -+ _simd_store_si((simdscalari*)aMTTop, bbox.top); -+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); -+ -+ // transpose verts needed for backend -+ /// @todo modify BE to take non-transformed verts -+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; -+ vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); -+ vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); -+ vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); -+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); -+ -+ // store render target array index -+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; -+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) -+ { -+ simdvector vRtai[2]; -+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai); -+ simdscalari vRtaii = _simd_castps_si(vRtai[0].x); -+ _simd_store_si((simdscalari*)aRTAI, vRtaii); -+ } -+ else -+ { -+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); -+ } -+ -+ // scan remaining valid prims and bin each separately -+ DWORD primIndex; -+ while (_BitScanForward(&primIndex, primMask)) -+ { -+ uint32_t linkageCount = state.linkageCount; -+ uint32_t linkageMask = state.linkageMask; -+ uint32_t numScalarAttribs = linkageCount * 4; -+ -+ BE_WORK work; -+ work.type = DRAW; -+ -+ TRIANGLE_WORK_DESC &desc = work.desc.tri; -+ -+ desc.triFlags.frontFacing = 1; -+ desc.triFlags.primID = pPrimID[primIndex]; -+ desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; -+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; -+ -+ work.pfnWork = RasterizeLine; -+ -+ // store active attribs -+ desc.pAttribs = (float*)pDC->arena.AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); -+ desc.numAttribs = linkageCount; -+ ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs); -+ -+ // store line vertex data -+ desc.pTriBuffer = (float*)pDC->arena.AllocAligned(4 * 4 * sizeof(float), 16); -+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); -+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); -+ 
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); -+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); -+ -+ // store user clip distances -+ if (rastState.clipDistanceMask) -+ { -+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); -+ desc.pUserClipBuffer = (float*)pDC->arena.Alloc(numClipDist * 2 * sizeof(float)); -+ ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); -+ } -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) -+ { -+ for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) -+ { -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_SETUP_TRIS) -+#endif -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ } -+ -+ primMask &= ~(1 << primIndex); -+ } -+ -+endBinLines: -+ -+ RDTSC_STOP(FEBinLines, 1, 0); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h -new file mode 100644 -index 0000000..e8452c3 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h -@@ -0,0 +1,326 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file frontend.h -+* -+* @brief Definitions for Frontend which handles vertex processing, -+* primitive assembly, clipping, binning, etc. 
-+* -+******************************************************************************/ -+#pragma once -+#include "context.h" -+ -+INLINE -+__m128i fpToFixedPoint(const __m128 vIn) -+{ -+ __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); -+ return _mm_cvtps_epi32(vFixed); -+} -+ -+INLINE -+simdscalari fpToFixedPointVertical(const simdscalar vIn) -+{ -+ simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(FIXED_POINT_SCALE)); -+ return _simd_cvtps_epi32(vFixed); -+} -+ -+ -+// Calculates the A and B coefficients for the 3 edges of the triangle -+// -+// maths for edge equations: -+// standard form of a line in 2d -+// Ax + By + C = 0 -+// A = y0 - y1 -+// B = x1 - x0 -+// C = x0y1 - x1y0 -+INLINE -+void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) -+{ -+ // vYsub = y1 y2 y0 dc -+ __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); -+ // vY = y0 y1 y2 dc -+ vA = _mm_sub_ps(vY, vYsub); -+ -+ // Result: -+ // A[0] = y0 - y1 -+ // A[1] = y1 - y2 -+ // A[2] = y2 - y0 -+ -+ // vXsub = x1 x2 x0 dc -+ __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); -+ // vX = x0 x1 x2 dc -+ vB = _mm_sub_ps(vXsub, vX); -+ -+ // Result: -+ // B[0] = x1 - x0 -+ // B[1] = x2 - x1 -+ // B[2] = x0 - x2 -+} -+ -+INLINE -+void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) -+{ -+ // generate edge equations -+ // A = y0 - y1 -+ // B = x1 - x0 -+ vA[0] = _simd_sub_ps(vY[0], vY[1]); -+ vA[1] = _simd_sub_ps(vY[1], vY[2]); -+ vA[2] = _simd_sub_ps(vY[2], vY[0]); -+ -+ vB[0] = _simd_sub_ps(vX[1], vX[0]); -+ vB[1] = _simd_sub_ps(vX[2], vX[1]); -+ vB[2] = _simd_sub_ps(vX[0], vX[2]); -+} -+ -+INLINE -+void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) -+{ -+ // generate edge equations -+ // A = y0 - y1 -+ // B = x1 - x0 -+ // C = x0y1 - x1y0 -+ __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); -+ vA = _mm_sub_epi32(vY, vYsub); -+ -+ __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); -+ vB = _mm_sub_epi32(vXsub, vX); -+} -+ -+INLINE -+void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) -+{ -+ // A = y0 - y1 -+ // B = x1 - x0 -+ vA[0] = _simd_sub_epi32(vY[0], vY[1]); -+ vA[1] = _simd_sub_epi32(vY[1], vY[2]); -+ vA[2] = _simd_sub_epi32(vY[2], vY[0]); -+ -+ vB[0] = _simd_sub_epi32(vX[1], vX[0]); -+ vB[1] = _simd_sub_epi32(vX[2], vX[1]); -+ vB[2] = _simd_sub_epi32(vX[0], vX[2]); -+} -+// Calculate the determinant of the triangle -+// 2 vectors between the 3 points: P, Q -+// Px = x0-x2, Py = y0-y2 -+// Qx = x1-x2, Qy = y1-y2 -+// |Px Qx| -+// det = | | = PxQy - PyQx -+// |Py Qy| -+// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) -+// try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx
-+//   : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
-+//   : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
-+//   : B[2]*A[1] - A[2]*B[1]
-+INLINE
-+float calcDeterminantInt(const __m128i vA, const __m128i vB)
-+{
-+    // vAShuf = [A1, A0, A2, A0]
-+    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
-+    // vBShuf = [B2, B0, B1, B0]
-+    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
-+    // vMul = [A1*B2, B1*A2]
-+    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
-+
-+    // shuffle upper to lower
-+    // vMul2 = [B1*A2, B1*A2]
-+    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
-+    // vMul = [A1*B2 - B1*A2]
-+    vMul = _mm_sub_epi64(vMul, vMul2);
-+
-+    // According to emmintrin.h, _mm_store1_pd() address must be 16-byte aligned
-+    OSALIGN(int64_t, 16) result;
-+    _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
-+
-+    double fResult = (double)result;
-+    fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
-+
-+    return (float)fResult;
-+}
-+
-+INLINE
-+void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
-+{
-+    // refer to calcDeterminantInt comment for calculation explanation
-+    // A1*B2
-+    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);   // 0 0 1 1 4 4 5 5
-+    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);   // 2 2 3 3 6 6 7 7
-+
-+    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
-+    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
-+
-+    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo);      // 0 1 4 5
-+    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi);      // 2 3 6 7
-+
-+    // B1*A2
-+    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
-+    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
-+
-+    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
-+    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
-+
-+    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
-+    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
-+
-+    // A1*B2 - A2*B1
-+    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
-+    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
-+
-+    // shuffle 0 1 4 5 -> 0 1 2 3
-+    simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
-+    simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
-+
-+    pvDet[0] = vResultLo;
-+    pvDet[1] = vResultHi;
-+}
-+
-+INLINE
-+void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
-+{
-+    // C = -Ax - By
-+    vC = _mm_mul_ps(vA, vX);
-+    __m128 vCy = _mm_mul_ps(vB, vY);
-+    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
-+    vC = _mm_sub_ps(vC, vCy);
-+}
-+
-+INLINE
-+void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
-+{
-+    vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
-+    vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
-+
-+    vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
-+    vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
-+
-+    vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
-+    vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
-+}
-+
-+template <uint32_t NumVerts>
-+INLINE
-+void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix)
-+{
-+    simdscalar m00 = _simd_load1_ps(&vpMatrix.m00);
-+    simdscalar m30 = _simd_load1_ps(&vpMatrix.m30);
-+    simdscalar m11 = _simd_load1_ps(&vpMatrix.m11);
-+    simdscalar m31 = _simd_load1_ps(&vpMatrix.m31);
-+    simdscalar m22 = _simd_load1_ps(&vpMatrix.m22);
-+    simdscalar m32 = _simd_load1_ps(&vpMatrix.m32);
-+
-+    for (uint32_t i = 0; i < NumVerts; ++i)
-+    {
-+        v[i].x = 
_simd_fmadd_ps(v[i].x, m00, m30); -+ v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); -+ v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); -+ } -+} -+ -+INLINE -+void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) -+{ -+ // Need horizontal fp min here -+ __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); -+ __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); -+ -+ __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); -+ __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); -+ -+ -+ __m128i vMinX = _mm_min_epi32(vX, vX1); -+ vMinX = _mm_min_epi32(vMinX, vX2); -+ -+ __m128i vMaxX = _mm_max_epi32(vX, vX1); -+ vMaxX = _mm_max_epi32(vMaxX, vX2); -+ -+ __m128i vMinY = _mm_min_epi32(vY, vY1); -+ vMinY = _mm_min_epi32(vMinY, vY2); -+ -+ __m128i vMaxY = _mm_max_epi32(vY, vY1); -+ vMaxY = _mm_max_epi32(vMaxY, vY2); -+ -+ bbox.left = _mm_extract_epi32(vMinX, 0); -+ bbox.right = _mm_extract_epi32(vMaxX, 0); -+ bbox.top = _mm_extract_epi32(vMinY, 0); -+ bbox.bottom = _mm_extract_epi32(vMaxY, 0); -+ -+#if 0 -+ Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0) -+B = _mm_shuffle_ps(Z, W, 0 0 0 0) -+A = _mm_shuffle_epi32(A, 3 0 3 0) -+A = _mm_shuffle_ps(A, B, 1 0 1 0) -+#endif -+ -+} -+ -+INLINE -+void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox) -+{ -+ simdscalari vMinX = vX[0]; -+ vMinX = _simd_min_epi32(vMinX, vX[1]); -+ vMinX = _simd_min_epi32(vMinX, vX[2]); -+ -+ simdscalari vMaxX = vX[0]; -+ vMaxX = _simd_max_epi32(vMaxX, vX[1]); -+ vMaxX = _simd_max_epi32(vMaxX, vX[2]); -+ -+ simdscalari vMinY = vY[0]; -+ vMinY = _simd_min_epi32(vMinY, vY[1]); -+ vMinY = _simd_min_epi32(vMinY, vY[2]); -+ -+ simdscalari vMaxY = vY[0]; -+ vMaxY = _simd_max_epi32(vMaxY, vY[1]); -+ vMaxY = _simd_max_epi32(vMaxY, vY[2]); -+ -+ bbox.left = vMinX; -+ bbox.right = vMaxX; -+ bbox.top = vMinY; -+ bbox.bottom = vMaxY; -+} -+ -+INLINE -+bool CanUseSimplePoints(DRAW_CONTEXT *pDC) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ -+ return (state.rastState.pointSize == 1.0f && -+ !state.rastState.pointParam && -+ !state.rastState.pointSpriteEnable); -+} -+ -+uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); -+uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); -+ -+// Templated Draw front-end function. 
All combinations of template parameter values are available -+template -+void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+ -+void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+ -+struct PA_STATE_BASE; // forward decl -+void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID); -+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); -+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h -new file mode 100644 -index 0000000..6140790 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h -@@ -0,0 +1,139 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file knobs.h -+* -+* @brief Static (Compile-Time) Knobs for Core. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+ -+#define KNOB_ARCH_AVX 0 -+#define KNOB_ARCH_AVX2 1 -+#define KNOB_ARCH_AVX512 2 -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Architecture validation -+/////////////////////////////////////////////////////////////////////////////// -+#if !defined(KNOB_ARCH) -+#define KNOB_ARCH KNOB_ARCH_AVX -+#endif -+ -+#if (KNOB_ARCH == KNOB_ARCH_AVX) -+#define KNOB_ARCH_ISA AVX -+#define KNOB_ARCH_STR "AVX" -+#define KNOB_SIMD_WIDTH 8 -+#elif (KNOB_ARCH == KNOB_ARCH_AVX2) -+#define KNOB_ARCH_ISA AVX2 -+#define KNOB_ARCH_STR "AVX2" -+#define KNOB_SIMD_WIDTH 8 -+#elif (KNOB_ARCH == KNOB_ARCH_AVX512) -+#define KNOB_ARCH_ISA AVX512F -+#define KNOB_ARCH_STR "AVX512" -+#define KNOB_SIMD_WIDTH 16 -+#error "AVX512 not yet supported" -+#else -+#error "Unknown architecture" -+#endif -+ -+#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Configuration knobs -+/////////////////////////////////////////////////////////////////////////////// -+#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon. -+ -+// Maximum supported number of active vertex buffer streams -+#define KNOB_NUM_STREAMS 32 -+ -+// Maximum supported number of attributes per vertex -+#define KNOB_NUM_ATTRIBUTES 37 -+ -+// Maximum supported active viewports and scissors -+#define KNOB_NUM_VIEWPORTS_SCISSORS 16 -+ -+// Guardband range used by the clipper -+#define KNOB_GUARDBAND_WIDTH 4096.0f -+#define KNOB_GUARDBAND_HEIGHT 2048.0f -+ -+/////////////////////////////// -+// Macro tile configuration -+/////////////////////////////// -+ -+// raster tile dimensions -+#define KNOB_TILE_X_DIM 8 -+#define KNOB_TILE_X_DIM_SHIFT 3 -+#define KNOB_TILE_Y_DIM 8 -+#define KNOB_TILE_Y_DIM_SHIFT 3 -+ -+// fixed macrotile pixel dimension for now, eventually will be -+// dynamically set based on tile format and pixel size -+#define KNOB_MACROTILE_X_DIM 64 -+#define KNOB_MACROTILE_Y_DIM 64 -+#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) -+#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) -+#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14 -+#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14 -+#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) -+#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) -+ -+// total # of hot tiles available. 
This should be enough to -+// fully render a 16kx16k 128bpp render target -+#define KNOB_NUM_HOT_TILES_X 256 -+#define KNOB_NUM_HOT_TILES_Y 256 -+#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT -+#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT -+#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT -+ -+#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 -+#error "incompatible width/tile dimensions" -+#endif -+ -+#if KNOB_SIMD_WIDTH == 8 -+#define SIMD_TILE_X_DIM 4 -+#define SIMD_TILE_Y_DIM 2 -+#else -+#error "Invalid simd width" -+#endif -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Optimization knobs -+/////////////////////////////////////////////////////////////////////////////// -+#define KNOB_USE_FAST_SRGB TRUE -+ -+// enables cut-aware primitive assembler -+#define KNOB_ENABLE_CUT_AWARE_PA TRUE -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Debug knobs -+/////////////////////////////////////////////////////////////////////////////// -+//#define KNOB_ENABLE_RDTSC -+//#define KNOB_SWRC_TRACING -+ -+// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. -+#if !defined(KNOB_ENABLE_TOSS_POINTS) -+#define KNOB_ENABLE_TOSS_POINTS 0 -+#endif -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h -new file mode 100644 -index 0000000..3f19555 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h -@@ -0,0 +1,98 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file knobs_init.h -+* -+* @brief Dynamic Knobs Initialization for Core. 
-+*
-+******************************************************************************/
-+#pragma once
-+
-+#include
-+#include
-+#include
-+#include
-+#include
-+
-+// Assume the type is compatible with a 32-bit integer
-+template <typename T>
-+static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
-+{
-+    uint32_t value = 0;
-+    if (sscanf(pOverride, "%u", &value))
-+    {
-+        knobValue = static_cast<T>(value);
-+    }
-+}
-+
-+static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
-+{
-+    size_t len = strlen(pOverride);
-+    if (len == 1)
-+    {
-+        auto c = tolower(pOverride[0]);
-+        if (c == 'y' || c == 't' || c == '1')
-+        {
-+            knobValue = true;
-+            return;
-+        }
-+        if (c == 'n' || c == 'f' || c == '0')
-+        {
-+            knobValue = false;
-+            return;
-+        }
-+    }
-+
-+    // Try converting to a number and casting to bool
-+    uint32_t value = 0;
-+    if (sscanf(pOverride, "%u", &value))
-+    {
-+        knobValue = value != 0;
-+        return;
-+    }
-+}
-+
-+static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
-+{
-+    float value = knobValue;
-+    if (sscanf(pOverride, "%f", &value))
-+    {
-+        knobValue = value;
-+    }
-+}
-+
-+template <typename T>
-+static inline void InitKnob(T& knob)
-+{
-+
-+    // TODO, read registry first
-+
-+    // Second, read environment variables
-+    const char* pOverride = getenv(knob.Name());
-+
-+    if (pOverride)
-+    {
-+        auto knobValue = knob.Value();
-+        ConvertEnvToKnob(pOverride, knobValue);
-+        knob.Value(knobValue);
-+    }
-+}
-diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
-new file mode 100644
-index 0000000..f7d5263
---- /dev/null
-+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
-@@ -0,0 +1,562 @@
-+/****************************************************************************
-+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+*
-+* Permission is hereby granted, free of charge, to any person obtaining a
-+* copy of this software and associated documentation files (the "Software"),
-+* to deal in the Software without restriction, including without limitation
-+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+* and/or sell copies of the Software, and to permit persons to whom the
-+* Software is furnished to do so, subject to the following conditions:
-+*
-+* The above copyright notice and this permission notice (including the next
-+* paragraph) shall be included in all copies or substantial portions of the
-+* Software.
-+*
-+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+* IN THE SOFTWARE.
-+* -+* @file multisample.h -+* -+******************************************************************************/ -+ -+#pragma once -+ -+#include "context.h" -+#include "format_traits.h" -+ -+INLINE -+uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) -+{ -+ static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16}; -+ assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX); -+ return sampleCountLUT[sampleCount]; -+} -+ -+INLINE -+SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) -+{ -+ switch(numSamples) -+ { -+ case 1: return SWR_MULTISAMPLE_1X; -+ case 2: return SWR_MULTISAMPLE_2X; -+ case 4: return SWR_MULTISAMPLE_4X; -+ case 8: return SWR_MULTISAMPLE_8X; -+ case 16: return SWR_MULTISAMPLE_16X; -+ default: assert(0); return SWR_MULTISAMPLE_1X; -+ } -+} -+ -+// hardcoded offsets based on Direct3d standard multisample positions -+// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner -+// coords are 0.8 fixed point offsets from (0, 0) -+template -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) = delete; -+ INLINE static __m128i vYi(uint32_t sampleNum) = delete; -+ INLINE static simdscalar vX(uint32_t sampleNum) = delete; -+ INLINE static simdscalar vY(uint32_t sampleNum) = delete; -+ INLINE static __m128i TileSampleOffsetsX() = delete; -+ INLINE static __m128i TileSampleOffsetsY() = delete; -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete; -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete; -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete; -+ -+ static const uint32_t numSamples = 0; -+ static const uint32_t sampleMask = 0; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X = _mm_set1_epi32(0x80); -+ return X; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y = _mm_set1_epi32(0x80); -+ return Y; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X = _simd_set1_ps(0.5f); -+ return X; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y = _simd_set1_ps(0.5f); -+ return Y; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x80; -+ static const uint32_t bboxRightEdge = 0x80; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x80; -+ static const uint32_t bboxBottomEdge = 0x80; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ return 0; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ return 0; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ return 0; -+ } -+ -+ static const uint32_t numSamples = 1; -+ static const uint32_t sampleMask = 1; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] {_mm_set1_epi32(0xC0), _mm_set1_epi32(0x40)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ 
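// How the integer and float tables in this specialization relate (assuming
// the 0.8 fixed-point convention stated in the header comment above): vXi/vYi
// return the sample offsets as 8-bit fixed point, while vX/vY below return
// the same offsets as floats, e.g. for 2x MSAA:
//     sample 0: 0xC0 = 192, 192 / 256.0f = 0.75f   (matches vX(0))
//     sample 1: 0x40 =  64,  64 / 256.0f = 0.25f   (matches vX(1))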
INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] {_mm_set1_epi32(0xC0), _mm_set1_epi32(0x40)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x40; -+ static const uint32_t bboxRightEdge = 0xC0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x40; -+ static const uint32_t bboxBottomEdge = 0xC0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 2; -+ static const uint32_t sampleMask = 0x3; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] -+ {_mm_set1_epi32(0x60), _mm_set1_epi32(0xE0), _mm_set1_epi32(0x20), _mm_set1_epi32(0xA0)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] -+ {_mm_set1_epi32(0x20), _mm_set1_epi32(0x60), _mm_set1_epi32(0xA0), _mm_set1_epi32(0xE0)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] -+ {_simd_set1_ps(0.375f), _simd_set1_ps(0.875), _simd_set1_ps(0.125), _simd_set1_ps(0.625)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] -+ {_simd_set1_ps(0.125), _simd_set1_ps(0.375f), _simd_set1_ps(0.625), _simd_set1_ps(0.875)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x20; -+ static const uint32_t bboxRightEdge = 0xE0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = 
_mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x20; -+ static const uint32_t bboxBottomEdge = 0xE0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 4; -+ static const uint32_t sampleMask = 0xF; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] -+ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x70), _mm_set1_epi32(0xD0), _mm_set1_epi32(0x50), -+ _mm_set1_epi32(0x30), _mm_set1_epi32(0x10), _mm_set1_epi32(0xB0), _mm_set1_epi32(0xF0)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] -+ {_mm_set1_epi32(0x50), _mm_set1_epi32(0xB0), _mm_set1_epi32(0x90), _mm_set1_epi32(0x30), -+ _mm_set1_epi32(0xD0), _mm_set1_epi32(0x70), _mm_set1_epi32(0xF0), _mm_set1_epi32(0x10)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] -+ {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.8125), _simd_set1_ps(0.3125), -+ _simd_set1_ps(0.1875), _simd_set1_ps(0.0625), _simd_set1_ps(0.6875), _simd_set1_ps(0.9375)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] -+ {_simd_set1_ps(0.3125), _simd_set1_ps(0.6875), _simd_set1_ps(0.5625), _simd_set1_ps(0.1875), -+ _simd_set1_ps(0.8125), _simd_set1_ps(0.4375), _simd_set1_ps(0.9375), _simd_set1_ps(0.0625)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x10; -+ static const uint32_t bboxRightEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, 
bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x10; -+ static const uint32_t bboxBottomEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 8; -+ static const uint32_t sampleMask = 0xFF; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] -+ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x70), _mm_set1_epi32(0x50), _mm_set1_epi32(0xC0), -+ _mm_set1_epi32(0x30), _mm_set1_epi32(0xA0), _mm_set1_epi32(0xD0), _mm_set1_epi32(0xB0), -+ _mm_set1_epi32(0x60), _mm_set1_epi32(0x80), _mm_set1_epi32(0x40), _mm_set1_epi32(0x20), -+ _mm_set1_epi32(0x00), _mm_set1_epi32(0xF0), _mm_set1_epi32(0xE0), _mm_set1_epi32(0x10)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] -+ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x50), _mm_set1_epi32(0xA0), _mm_set1_epi32(0x70), -+ _mm_set1_epi32(0x60), _mm_set1_epi32(0xD0), _mm_set1_epi32(0xB0), _mm_set1_epi32(0x30), -+ _mm_set1_epi32(0xE0), _mm_set1_epi32(0x10), _mm_set1_epi32(0x20), _mm_set1_epi32(0xC0), -+ _mm_set1_epi32(0x80), 
_mm_set1_epi32(0x40), _mm_set1_epi32(0xF0), _mm_set1_epi32(0x00)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] -+ {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.3125), _simd_set1_ps(0.7500), -+ _simd_set1_ps(0.1875), _simd_set1_ps(0.6250), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), -+ _simd_set1_ps(0.3750), _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.1250), -+ _simd_set1_ps(0.0000), _simd_set1_ps(0.9375), _simd_set1_ps(0.8750), _simd_set1_ps(0.0625)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] -+ {_simd_set1_ps(0.5625), _simd_set1_ps(0.3125), _simd_set1_ps(0.6250), _simd_set1_ps(0.4375), -+ _simd_set1_ps(0.3750), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), _simd_set1_ps(0.1875), -+ _simd_set1_ps(0.8750), _simd_set1_ps(0.0625), _simd_set1_ps(0.1250), _simd_set1_ps(0.7500), -+ _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.9375), _simd_set1_ps(0.0000)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x00; -+ static const uint32_t bboxRightEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x00; -+ static const uint32_t bboxBottomEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM 
* KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 16; -+ static const uint32_t sampleMask = 0xFFFF; -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h -new file mode 100644 -index 0000000..52ea820 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h -@@ -0,0 +1,1205 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file pa.h -+* -+* @brief Definitions for primitive assembly. -+* N primitives are assembled at a time, where N is the SIMD width. -+* A state machine, that is specific for a given topology, drives the -+* assembly of vertices into triangles. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "frontend.h" -+ -+struct PA_STATE -+{ -+ DRAW_CONTEXT *pDC; // draw context -+ uint8_t* pStreamBase; // vertex stream -+ uint32_t streamSizeInVerts; // total size of the input stream in verts -+ -+ // The topology the binner will use. In some cases the FE changes the topology from the api state. -+ PRIMITIVE_TOPOLOGY binTopology; -+ -+ PA_STATE() {} -+ PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : -+ pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {} -+ -+ virtual bool HasWork() = 0; -+ virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; -+ virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; -+ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0; -+ virtual bool NextPrim() = 0; -+ virtual simdvertex& GetNextVsOutput() = 0; -+ virtual bool GetNextStreamOutput() = 0; -+ virtual simdmask& GetNextVsIndices() = 0; -+ virtual uint32_t NumPrims() = 0; -+ virtual void Reset() = 0; -+ virtual simdscalari GetPrimID(uint32_t startID) = 0; -+}; -+ -+// The Optimized PA is a state machine that assembles triangles from vertex shader simd -+// output. Here is the sequence -+// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). -+// 2. Execute PA function to assemble and bin triangles. -+// a. The PA function is a set of functions that collectively make up the -+// state machine for a given topology. -+// 1. We use a state index to track which PA function to call. -+// b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle. -+// 1. We call this the current and previous simd vertex. -+// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In -+// order to assemble the second triangle, for a triangle list, we'll need the -+// last vertex from the previous simd and the first 2 vertices from the current simd. -+// 3. At times the PA can assemble multiple triangles from the 2 simd vertices. -+// -+// This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without -+// cuts -+struct PA_STATE_OPT : public PA_STATE -+{ -+ simdvertex leadingVertex; // For tri-fan -+ uint32_t numPrims; // Total number of primitives for draw. -+ uint32_t numPrimsComplete; // Total number of complete primitives. -+ -+ uint32_t numSimdPrims; // Number of prims in current simd. -+ -+ uint32_t cur; // index to current VS output. -+ uint32_t prev; // index to prev VS output. Not really needed in the state. -+ uint32_t first; // index to first VS output. Used for trifan. 
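// Concrete example of the flow described above (assuming the 8-wide AVX SIMD
// configured in knobs.h): a full SIMD of triangle-list prims is 8 triangles,
// which consume 8 * 3 = 24 vertices, i.e. 3 simdvertex batches, so the PA
// must step through several simd vertices (tracked via 'cur'/'prev', plus
// 'first' for tri-fans) before Assemble() can hand a complete set of prims
// to the binner.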
-+ -+ uint32_t counter; // state counter -+ bool reset; // reset state -+ -+ uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2}) -+ simdscalari primID; -+ -+ typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); -+ typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+ PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles. -+ PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle. -+ -+ // state used to advance the PA when Next is called -+ PFN_PA_FUNC pfnPaNextFunc; -+ uint32_t nextNumSimdPrims; -+ uint32_t nextNumPrimsIncrement; -+ bool nextReset; -+ bool isStreaming; -+ -+ simdmask tmpIndices; // temporary index store for unused virtual function -+ -+ PA_STATE_OPT() {} -+ PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, -+ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); -+ -+ bool HasWork() -+ { -+ return (this->numPrimsComplete < this->numPrims) ? true : false; -+ } -+ -+ simdvector& GetSimdVector(uint32_t index, uint32_t slot) -+ { -+ simdvertex* pVertex = (simdvertex*)pStreamBase; -+ return pVertex[index].attrib[slot]; -+ } -+ -+ // Assembles 4 triangles. Each simdvector is a single vertex from 4 -+ // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. -+ bool Assemble(uint32_t slot, simdvector verts[]) -+ { -+ return this->pfnPaFunc(*this, slot, verts); -+ } -+ -+ // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). -+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) -+ { -+ return this->pfnPaSingleFunc(*this, slot, primIndex, verts); -+ } -+ -+ bool NextPrim() -+ { -+ this->pfnPaFunc = this->pfnPaNextFunc; -+ this->numSimdPrims = this->nextNumSimdPrims; -+ this->numPrimsComplete += this->nextNumPrimsIncrement; -+ this->reset = this->nextReset; -+ -+ if (this->isStreaming) -+ { -+ this->reset = false; -+ } -+ -+ bool morePrims = false; -+ -+ if (this->numSimdPrims > 0) -+ { -+ morePrims = true; -+ this->numSimdPrims--; -+ } -+ else -+ { -+ this->counter = (this->reset) ? 0 : (this->counter + 1); -+ this->reset = false; -+ } -+ -+ this->pfnPaFunc = this->pfnPaNextFunc; -+ -+ if (!HasWork()) -+ { -+ morePrims = false; // no more to do -+ } -+ -+ return morePrims; -+ } -+ -+ simdvertex& GetNextVsOutput() -+ { -+ // increment cur and prev indices -+ const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; -+ this->prev = this->cur; // prev is undefined for first state. -+ this->cur = this->counter % numSimdVerts; -+ -+ simdvertex* pVertex = (simdvertex*)pStreamBase; -+ return pVertex[this->cur]; -+ } -+ -+ simdmask& GetNextVsIndices() -+ { -+ // unused in optimized PA, pass tmp buffer back -+ return tmpIndices; -+ } -+ -+ bool GetNextStreamOutput() -+ { -+ this->prev = this->cur; -+ this->cur = this->counter; -+ -+ return HasWork(); -+ } -+ -+ uint32_t NumPrims() -+ { -+ return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? 
-+ (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; -+ } -+ -+ void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, -+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, -+ uint32_t numSimdPrims = 0, -+ uint32_t numPrimsIncrement = 0, -+ bool reset = false) -+ { -+ this->pfnPaNextFunc = pfnPaNextFunc; -+ this->nextNumSimdPrims = numSimdPrims; -+ this->nextNumPrimsIncrement = numPrimsIncrement; -+ this->nextReset = reset; -+ -+ this->pfnPaSingleFunc = pfnPaNextSingleFunc; -+ } -+ -+ void Reset() -+ { -+ this->numPrimsComplete = 0; -+ this->numSimdPrims = 0; -+ this->cur = 0; -+ this->prev = 0; -+ this->first = 0; -+ this->counter = 0; -+ this->reset = false; -+ } -+ -+ simdscalari GetPrimID(uint32_t startID) -+ { -+ return _simd_add_epi32(this->primID, -+ _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); -+ } -+}; -+ -+// helper C wrappers to avoid having to rewrite all the PA topology state functions -+INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, -+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, -+ uint32_t numSimdPrims = 0, -+ uint32_t numPrimsIncrement = 0, -+ bool reset = false) -+{ -+ return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); -+} -+INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) -+{ -+ return pa.GetSimdVector(index, slot); -+} -+ -+INLINE __m128 swizzleLane0(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane1(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane2(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane3(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane4(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); -+ -+} -+ -+INLINE __m128 swizzleLane5(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); -+} -+ -+INLINE __m128 swizzleLane6(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); -+} -+ -+INLINE __m128 swizzleLane7(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); -+} -+ -+INLINE __m128 swizzleLaneN(const simdvector &a, int lane) -+{ -+ switch (lane) { -+ case 0: -+ return swizzleLane0(a); -+ case 1: -+ return swizzleLane1(a); -+ case 2: -+ return swizzleLane2(a); 
-+ case 3: -+ return swizzleLane3(a); -+ case 4: -+ return swizzleLane4(a); -+ case 5: -+ return swizzleLane5(a); -+ case 6: -+ return swizzleLane6(a); -+ case 7: -+ return swizzleLane7(a); -+ default: -+ return _mm_setzero_ps(); -+ } -+} -+ -+// Cut-aware primitive assembler. -+struct PA_STATE_CUT : public PA_STATE -+{ -+ simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex -+ uint32_t numVerts; // number of vertices available in buffer store -+ uint32_t numAttribs; // number of attributes -+ int32_t numRemainingVerts; // number of verts remaining to be assembled -+ uint32_t numVertsToAssemble; // total number of verts to assemble for the draw -+ OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather -+ simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd -+ uint32_t numPrimsAssembled; // number of primitives that are fully assembled -+ uint32_t headVertex; // current unused vertex slot in vertex buffer store -+ uint32_t tailVertex; // beginning vertex currently assembling -+ uint32_t curVertex; // current unprocessed vertex -+ uint32_t startPrimId; // starting prim id -+ simdscalari vPrimId; // vector of prim ID -+ bool needOffsets; // need to compute gather offsets for current SIMD -+ uint32_t vertsPerPrim; -+ simdvertex tmpVertex; // temporary simdvertex for unimplemented API -+ bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they -+ // are ignored. Fetch shader sends invalid verts on cuts that should be ignored -+ // while the GS sends valid verts for every index -+ // Topology state tracking -+ uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; -+ uint32_t curIndex; -+ bool reverseWinding; // indicates reverse winding for strips -+ int32_t adjExtraVert; // extra vert uses for tristrip w/ adj -+ -+ typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); -+ PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert -+ -+ PA_STATE_CUT() {} -+ PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, -+ uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) -+ : PA_STATE(pDC, in_pStream, in_streamSizeInVerts) -+ { -+ numVerts = in_streamSizeInVerts; -+ numAttribs = in_numAttribs; -+ binTopology = topo; -+ needOffsets = false; -+ processCutVerts = in_processCutVerts; -+ -+ numVertsToAssemble = numRemainingVerts = in_numVerts; -+ numPrimsAssembled = 0; -+ headVertex = tailVertex = curVertex = 0; -+ -+ curIndex = 0; -+ pCutIndices = in_pIndices; -+ memset(indices, 0, sizeof(indices)); -+ vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ reverseWinding = false; -+ adjExtraVert = -1; -+ -+ bool gsEnabled = pDC->pState->state.gsState.gsEnable; -+ vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); -+ -+ switch (topo) -+ { -+ case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; -+ case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? 
&PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; -+ case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; -+ case TOP_TRI_STRIP_ADJ: if (gsEnabled) -+ { -+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ; -+ } -+ else -+ { -+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ; -+ } -+ break; -+ -+ case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; -+ case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; -+ case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; -+ case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; -+ case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; -+ default: assert(0 && "Unimplemented topology"); -+ } -+ } -+ -+ simdvertex& GetNextVsOutput() -+ { -+ uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; -+ this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; -+ this->needOffsets = true; -+ return ((simdvertex*)pStreamBase)[vertexIndex]; -+ } -+ -+ simdmask& GetNextVsIndices() -+ { -+ uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; -+ simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; -+ return *pCurCutIndex; -+ } -+ -+ simdvector& GetSimdVector(uint32_t index, uint32_t slot) -+ { -+ // unused -+ SWR_ASSERT(0 && "Not implemented"); -+ return this->tmpVertex.attrib[0]; -+ } -+ -+ bool GetNextStreamOutput() -+ { -+ this->headVertex += KNOB_SIMD_WIDTH; -+ this->needOffsets = true; -+ return HasWork(); -+ } -+ -+ simdscalari GetPrimID(uint32_t startID) -+ { -+ return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); -+ } -+ -+ void Reset() -+ { -+ this->numRemainingVerts = this->numVertsToAssemble; -+ this->numPrimsAssembled = 0; -+ this->curIndex = 0; -+ this->curVertex = 0; -+ this->tailVertex = 0; -+ this->headVertex = 0; -+ this->reverseWinding = false; -+ this->adjExtraVert = -1; -+ this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ } -+ -+ bool HasWork() -+ { -+ return this->numRemainingVerts > 0 || this->adjExtraVert != -1; -+ } -+ -+ bool IsVertexStoreFull() -+ { -+ return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; -+ } -+ -+ void RestartTopology() -+ { -+ this->curIndex = 0; -+ this->reverseWinding = false; -+ this->adjExtraVert = -1; -+ } -+ -+ bool IsCutIndex(uint32_t vertex) -+ { -+ uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; -+ uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); -+ return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; -+ } -+ -+ // iterates across the unprocessed verts until we hit the end or we -+ // have assembled SIMD prims -+ void ProcessVerts() -+ { -+ while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && -+ this->numRemainingVerts > 0 && -+ this->curVertex != this->headVertex) -+ { -+ // if cut index, restart topology -+ if (IsCutIndex(this->curVertex)) -+ { -+ if (this->processCutVerts) -+ { -+ (this->*pfnPa)(this->curVertex, false); -+ } -+ // finish off tri strip w/ adj before restarting topo -+ if (this->adjExtraVert != -1) -+ { -+ (this->*pfnPa)(this->curVertex, true); -+ } -+ RestartTopology(); -+ } -+ else -+ { -+ (this->*pfnPa)(this->curVertex, false); -+ } -+ -+ this->curVertex = (this->curVertex + 1) % this->numVerts; -+ this->numRemainingVerts--; -+ } -+ -+ // special case last primitive for tri strip w/ 
adj -+ if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) -+ { -+ (this->*pfnPa)(this->curVertex, true); -+ } -+ } -+ -+ void Advance() -+ { -+ // done with current batch -+ // advance tail to the current unsubmitted vertex -+ this->tailVertex = this->curVertex; -+ this->numPrimsAssembled = 0; -+ this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); -+ } -+ -+ bool NextPrim() -+ { -+ // if we've assembled enough prims, we can advance to the next set of verts -+ if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) -+ { -+ Advance(); -+ } -+ return false; -+ } -+ -+ void ComputeOffsets() -+ { -+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) -+ { -+ simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; -+ -+ // step to simdvertex batch -+ const uint32_t simdShift = 3; // @todo make knob -+ simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); -+ this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); -+ -+ // step to index -+ const uint32_t simdMask = 0x7; // @todo make knob -+ simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); -+ this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); -+ } -+ } -+ -+ bool Assemble(uint32_t slot, simdvector result[]) -+ { -+ // process any outstanding verts -+ ProcessVerts(); -+ -+ // return false if we don't have enough prims assembled -+ if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) -+ { -+ return false; -+ } -+ -+ // cache off gather offsets given the current SIMD set of indices the first time we get an assemble -+ if (this->needOffsets) -+ { -+ ComputeOffsets(); -+ this->needOffsets = false; -+ } -+ -+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) -+ { -+ simdscalari offsets = this->vOffsets[v]; -+ -+ // step to attribute -+ offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); -+ -+ float* pBase = (float*)this->pStreamBase; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); -+ -+ // move base to next component -+ pBase += KNOB_SIMD_WIDTH; -+ } -+ } -+ -+ return true; -+ } -+ -+ void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) -+ { -+ // move to slot -+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) -+ { -+ uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; -+ uint32_t offset = pOffset[triIndex]; -+ offset += sizeof(simdvector) * slot; -+ float* pVert = (float*)&tri[v]; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ float* pComponent = (float*)(this->pStreamBase + offset); -+ pVert[c] = *pComponent; -+ offset += KNOB_SIMD_WIDTH * sizeof(float); -+ } -+ } -+ } -+ -+ uint32_t NumPrims() -+ { -+ return this->numPrimsAssembled; -+ } -+ -+ // Per-topology functions -+ void ProcessVertTriStrip(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 3) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ if (reverseWinding) -+ { -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[1]; -+ } -+ else -+ { -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ } -+ -+ // increment numPrimsAssembled -+ 
this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->vert[1] = this->vert[2]; -+ this->curIndex = 2; -+ this->reverseWinding ^= 1; -+ } -+ } -+ -+ template -+ void AssembleTriStripAdj() -+ { -+ if (!gsEnabled) -+ { -+ this->vert[1] = this->vert[2]; -+ this->vert[2] = this->vert[4]; -+ -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ -+ this->vert[4] = this->vert[2]; -+ this->vert[2] = this->vert[1]; -+ } -+ else -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ this->indices[4][this->numPrimsAssembled] = this->vert[4]; -+ this->indices[5][this->numPrimsAssembled] = this->vert[5]; -+ } -+ this->numPrimsAssembled++; -+ } -+ -+ -+ template -+ void ProcessVertTriStripAdj(uint32_t index, bool finish) -+ { -+ // handle last primitive of tristrip -+ if (finish && this->adjExtraVert != -1) -+ { -+ this->vert[3] = this->adjExtraVert; -+ AssembleTriStripAdj(); -+ this->adjExtraVert = -1; -+ return; -+ } -+ -+ switch (this->curIndex) -+ { -+ case 0: -+ case 1: -+ case 2: -+ case 4: -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ break; -+ case 3: -+ this->vert[5] = index; -+ this->curIndex++; -+ break; -+ case 5: -+ if (this->adjExtraVert == -1) -+ { -+ this->adjExtraVert = index; -+ } -+ else -+ { -+ this->vert[3] = index; -+ if (!gsEnabled) -+ { -+ AssembleTriStripAdj(); -+ -+ uint32_t nextTri[6]; -+ if (this->reverseWinding) -+ { -+ nextTri[0] = this->vert[4]; -+ nextTri[1] = this->vert[0]; -+ nextTri[2] = this->vert[2]; -+ nextTri[4] = this->vert[3]; -+ nextTri[5] = this->adjExtraVert; -+ } -+ else -+ { -+ nextTri[0] = this->vert[2]; -+ nextTri[1] = this->adjExtraVert; -+ nextTri[2] = this->vert[3]; -+ nextTri[4] = this->vert[4]; -+ nextTri[5] = this->vert[0]; -+ } -+ for (uint32_t i = 0; i < 6; ++i) -+ { -+ this->vert[i] = nextTri[i]; -+ } -+ -+ this->adjExtraVert = -1; -+ this->reverseWinding ^= 1; -+ } -+ else -+ { -+ this->curIndex++; -+ } -+ } -+ break; -+ case 6: -+ SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!"); -+ AssembleTriStripAdj(); -+ -+ uint32_t nextTri[6]; -+ if (this->reverseWinding) -+ { -+ nextTri[0] = this->vert[4]; -+ nextTri[1] = this->vert[0]; -+ nextTri[2] = this->vert[2]; -+ nextTri[4] = this->vert[3]; -+ nextTri[5] = this->adjExtraVert; -+ } -+ else -+ { -+ nextTri[0] = this->vert[2]; -+ nextTri[1] = this->adjExtraVert; -+ nextTri[2] = this->vert[3]; -+ nextTri[4] = this->vert[4]; -+ nextTri[5] = this->vert[0]; -+ } -+ for (uint32_t i = 0; i < 6; ++i) -+ { -+ this->vert[i] = nextTri[i]; -+ } -+ this->reverseWinding ^= 1; -+ this->adjExtraVert = index; -+ this->curIndex--; -+ break; -+ } -+ } -+ -+ void ProcessVertTriList(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 3) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertTriListAdj(uint32_t 
index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 6) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ this->indices[4][this->numPrimsAssembled] = this->vert[4]; -+ this->indices[5][this->numPrimsAssembled] = this->vert[5]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 6) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[4]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->curIndex = 0; -+ } -+ } -+ -+ -+ void ProcessVertLineList(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 2) -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertLineStrip(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 2) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->curIndex = 1; -+ } -+ } -+ -+ void ProcessVertLineStripAdj(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->vert[1] = this->vert[2]; -+ this->vert[2] = this->vert[3]; -+ this->curIndex = 3; -+ } -+ } -+ -+ void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->vert[1] = this->vert[2]; -+ this->vert[2] = this->vert[3]; -+ this->curIndex = 3; -+ } -+ } -+ -+ void ProcessVertLineListAdj(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ 
this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertPointList(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 1) -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+}; -+ -+// Primitive Assembly for data output from the DomainShader. -+struct PA_TESS : PA_STATE -+{ -+ PA_TESS( -+ DRAW_CONTEXT *in_pDC, -+ const simdscalar* in_pVertData, -+ uint32_t in_attributeStrideInVectors, -+ uint32_t in_numAttributes, -+ uint32_t* (&in_ppIndices)[3], -+ uint32_t in_numPrims, -+ PRIMITIVE_TOPOLOGY in_binTopology) : -+ -+ PA_STATE(in_pDC, nullptr, 0), -+ m_pVertexData(in_pVertData), -+ m_attributeStrideInVectors(in_attributeStrideInVectors), -+ m_numAttributes(in_numAttributes), -+ m_numPrims(in_numPrims) -+ { -+ m_vPrimId = _simd_setzero_si(); -+ binTopology = in_binTopology; -+ m_ppIndices[0] = in_ppIndices[0]; -+ m_ppIndices[1] = in_ppIndices[1]; -+ m_ppIndices[2] = in_ppIndices[2]; -+ -+ switch (binTopology) -+ { -+ case TOP_POINT_LIST: -+ m_numVertsPerPrim = 1; -+ break; -+ -+ case TOP_LINE_LIST: -+ m_numVertsPerPrim = 2; -+ break; -+ -+ case TOP_TRIANGLE_LIST: -+ m_numVertsPerPrim = 3; -+ break; -+ -+ default: -+ SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); -+ break; -+ } -+ } -+ -+ bool HasWork() -+ { -+ return m_numPrims != 0; -+ } -+ -+ simdvector& GetSimdVector(uint32_t index, uint32_t slot) -+ { -+ SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); -+ static simdvector junk = { 0 }; -+ return junk; -+ } -+ -+ static simdscalari GenPrimMask(uint32_t numPrims) -+ { -+ SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); -+#if KNOB_SIMD_WIDTH == 8 -+ static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = -+ { -+ -1, -1, -1, -1, -1, -1, -1, -1, -+ 0, 0, 0, 0, 0, 0, 0, 0 -+ }; -+#elif KNOB_SIMD_WIDTH == 16 -+ static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = -+ { -+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -+ }; -+#else -+#error "Help, help, I can't get up!" 
-+#endif -+ -+ return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); -+ } -+ -+ bool Assemble(uint32_t slot, simdvector verts[]) -+ { -+ static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); -+ SWR_ASSERT(slot < m_numAttributes); -+ -+ uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); -+ if (0 == numPrimsToAssemble) -+ { -+ return false; -+ } -+ -+ simdscalari mask = GenPrimMask(numPrimsToAssemble); -+ -+ const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; -+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) -+ { -+ simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); -+ -+ const float* pBase = pBaseAttrib; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ verts[i].v[c] = _simd_mask_i32gather_ps( -+ _simd_setzero_ps(), -+ pBase, -+ indices, -+ _simd_castsi_ps(mask), -+ 4 /* gcc doesn't like sizeof(float) */); -+ pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; -+ } -+ } -+ -+ return true; -+ } -+ -+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) -+ { -+ SWR_ASSERT(slot < m_numAttributes); -+ SWR_ASSERT(primIndex < PA_TESS::NumPrims()); -+ -+ const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; -+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) -+ { -+ uint32_t index = m_ppIndices[i][primIndex]; -+ const float* pVertData = pVertDataBase; -+ float* pVert = (float*)&verts[i]; -+ -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ pVert[c] = pVertData[index]; -+ pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; -+ } -+ } -+ } -+ -+ bool NextPrim() -+ { -+ uint32_t numPrims = PA_TESS::NumPrims(); -+ m_numPrims -= numPrims; -+ m_ppIndices[0] += numPrims; -+ m_ppIndices[1] += numPrims; -+ m_ppIndices[2] += numPrims; -+ -+ return HasWork(); -+ } -+ -+ simdvertex& GetNextVsOutput() -+ { -+ SWR_ASSERT(0, "%s", __FUNCTION__); -+ static simdvertex junk; -+ return junk; -+ } -+ -+ bool GetNextStreamOutput() -+ { -+ SWR_ASSERT(0, "%s", __FUNCTION__); -+ return false; -+ } -+ -+ simdmask& GetNextVsIndices() -+ { -+ SWR_ASSERT(0, "%s", __FUNCTION__); -+ static simdmask junk; -+ return junk; -+ } -+ -+ uint32_t NumPrims() -+ { -+ return std::min(m_numPrims, KNOB_SIMD_WIDTH); -+ } -+ -+ void Reset() { SWR_ASSERT(0); }; -+ -+ simdscalari GetPrimID(uint32_t startID) -+ { -+ return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); -+ } -+ -+private: -+ const simdscalar* m_pVertexData = nullptr; -+ uint32_t m_attributeStrideInVectors = 0; -+ uint32_t m_numAttributes = 0; -+ uint32_t m_numPrims = 0; -+ uint32_t* m_ppIndices[3]; -+ -+ uint32_t m_numVertsPerPrim = 0; -+ -+ simdscalari m_vPrimId; -+}; -+ -+// Primitive Assembler factory class, responsible for creating and initializing the correct assembler -+// based on state. 
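The sliding-window trick in PA_TESS::GenPrimMask above deserves a scalar illustration: reading KNOB_SIMD_WIDTH consecutive entries starting at offset KNOB_SIMD_WIDTH - numPrims from a table of eight -1s followed by eight 0s yields exactly numPrims active lanes. A minimal scalar sketch, assuming an 8-wide SIMD; GenPrimMaskScalar is a hypothetical name used only here:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of PA_TESS::GenPrimMask for an 8-wide SIMD:
// lanes [0, numPrims) become all-ones (-1), the rest become 0.
std::array<int32_t, 8> GenPrimMaskScalar(uint32_t numPrims)
{
    assert(numPrims <= 8);
    static const int32_t maskGen[16] = {
        -1, -1, -1, -1, -1, -1, -1, -1,
         0,  0,  0,  0,  0,  0,  0,  0
    };
    std::array<int32_t, 8> mask;
    // Sliding window: start further into the table as numPrims shrinks.
    for (uint32_t lane = 0; lane < 8; ++lane)
        mask[lane] = maskGen[(8 - numPrims) + lane];
    return mask;
}

int main()
{
    auto m = GenPrimMaskScalar(3);
    assert(m[0] == -1 && m[2] == -1 && m[3] == 0 && m[7] == 0);
    return 0;
}
```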
-+struct PA_FACTORY -+{ -+ PA_FACTORY(DRAW_CONTEXT* pDC, bool isIndexed, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) -+ { -+#if KNOB_ENABLE_CUT_AWARE_PA == TRUE -+ const API_STATE& state = GetApiState(pDC); -+ if ((isIndexed && ( -+ topo == TOP_TRIANGLE_STRIP || -+ (topo == TOP_POINT_LIST && CanUseSimplePoints(pDC)) || -+ topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || -+ topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ || -+ topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || -+ topo == TOP_TRI_STRIP_ADJ)) || -+ -+ // non-indexed draws with adjacency topologies must use cut-aware PA until we add support -+ // for them in the optimized PA -+ (!isIndexed && ( -+ topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))) -+ { -+ DWORD numAttribs; -+ _BitScanReverse(&numAttribs, state.feAttribMask); -+ numAttribs++; -+ this->paCut = PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, -+ &this->indexStore[0], numVerts, numAttribs, state.topology, false); -+ cutPA = true; -+ } -+ else -+#endif -+ { -+ uint32_t numPrims = GetNumPrims(in_topo, numVerts); -+ this->paOpt = PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); -+ cutPA = false; -+ } -+ -+ } -+ -+ PA_STATE& GetPA() -+ { -+#if KNOB_ENABLE_CUT_AWARE_PA == TRUE -+ if (cutPA) -+ { -+ return this->paCut; -+ } -+ else -+#endif -+ { -+ return this->paOpt; -+ } -+ } -+ -+ PA_STATE_OPT paOpt; -+ PA_STATE_CUT paCut; -+ bool cutPA; -+ -+ PRIMITIVE_TOPOLOGY topo; -+ -+ simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; -+ simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp -new file mode 100644 -index 0000000..6dce0bb ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp -@@ -0,0 +1,1330 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file pa_avx.cpp -+* -+* @brief AVX implementation for primitive assembly. -+* N primitives are assembled at a time, where N is the SIMD width. -+* A state machine, that is specific for a given topology, drives the -+* assembly of vertices into triangles. 
-+* -+******************************************************************************/ -+#include "context.h" -+#include "pa.h" -+#include "frontend.h" -+ -+#if (KNOB_SIMD_WIDTH == 8) -+ -+bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+ -+bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]); -+ -+bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]); -+ -+bool PaTriPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriPoints1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+void PaTriPointsSingle1(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+template -+void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output -+ // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. -+ // Each attribute has 4 components. 
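The gather below boils down to simple index arithmetic on the SoA vertex store. A scalar sketch of that arithmetic, assuming KNOB_SIMD_WIDTH == 8; PatchControlPointLocation is a hypothetical name:

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of the control-point gather in PaPatchListSingle, assuming
// KNOB_SIMD_WIDTH == 8. Control point 'cp' of patch 'primIndex' lives at
// lane (index % 8) of SIMD vector (index / 8) in the vertex store.
void PatchControlPointLocation(uint32_t primIndex, uint32_t totalControlPoints)
{
    for (uint32_t cp = 0; cp < totalControlPoints; ++cp)
    {
        uint32_t input_cp   = primIndex * totalControlPoints + cp;
        uint32_t input_vec  = input_cp / 8;   // which simdvector to read
        uint32_t input_lane = input_cp % 8;   // which lane inside it
        printf("patch %u, cp %u -> vector %u, lane %u\n",
               primIndex, cp, input_vec, input_lane);
    }
}

int main()
{
    // e.g. a 4-control-point patch list: patch 3 starts at vertex 12,
    // i.e. lane 4 of the second simdvector.
    PatchControlPointLocation(3, 4);
    return 0;
}
```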
-+ -+ /// @todo Optimize this -+ -+ float* pOutVec = (float*)verts; -+ -+ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) -+ { -+ uint32_t input_cp = primIndex * TotalControlPoints + cp; -+ uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; -+ uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; -+ -+ // Loop over all components of the attribute -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); -+ pOutVec[cp * 4 + i] = pInputVec[input_lane]; -+ } -+ } -+} -+ -+template -+static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState( -+ pa, -+ PaPatchList, -+ PaPatchListSingle); -+ -+ return false; -+} -+ -+template -+static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output -+ // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. -+ // Each attribute has 4 components. -+ -+ /// @todo Optimize this -+ -+ // Loop over all components of the attribute -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) -+ { -+ float vec[KNOB_SIMD_WIDTH]; -+ for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) -+ { -+ uint32_t input_cp = lane * TotalControlPoints + cp; -+ uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; -+ uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; -+ -+ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); -+ vec[lane] = pInputVec[input_lane]; -+ } -+ verts[cp][i] = _simd_loadu_ps(vec); -+ } -+ } -+ -+ SetNextPaState( -+ pa, -+ PaPatchList, -+ PaPatchListSingle, -+ 0, -+ KNOB_SIMD_WIDTH, -+ true); -+ -+ return true; -+} -+ -+#define PA_PATCH_LIST_TERMINATOR(N) \ -+ template<> bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ -+ { return PaPatchListTerm(pa, slot, verts); } -+PA_PATCH_LIST_TERMINATOR(1) -+PA_PATCH_LIST_TERMINATOR(2) -+PA_PATCH_LIST_TERMINATOR(3) -+PA_PATCH_LIST_TERMINATOR(4) -+PA_PATCH_LIST_TERMINATOR(5) -+PA_PATCH_LIST_TERMINATOR(6) -+PA_PATCH_LIST_TERMINATOR(7) -+PA_PATCH_LIST_TERMINATOR(8) -+PA_PATCH_LIST_TERMINATOR(9) -+PA_PATCH_LIST_TERMINATOR(10) -+PA_PATCH_LIST_TERMINATOR(11) -+PA_PATCH_LIST_TERMINATOR(12) -+PA_PATCH_LIST_TERMINATOR(13) -+PA_PATCH_LIST_TERMINATOR(14) -+PA_PATCH_LIST_TERMINATOR(15) -+PA_PATCH_LIST_TERMINATOR(16) -+PA_PATCH_LIST_TERMINATOR(17) -+PA_PATCH_LIST_TERMINATOR(18) -+PA_PATCH_LIST_TERMINATOR(19) -+PA_PATCH_LIST_TERMINATOR(20) -+PA_PATCH_LIST_TERMINATOR(21) -+PA_PATCH_LIST_TERMINATOR(22) -+PA_PATCH_LIST_TERMINATOR(23) -+PA_PATCH_LIST_TERMINATOR(24) -+PA_PATCH_LIST_TERMINATOR(25) -+PA_PATCH_LIST_TERMINATOR(26) -+PA_PATCH_LIST_TERMINATOR(27) -+PA_PATCH_LIST_TERMINATOR(28) -+PA_PATCH_LIST_TERMINATOR(29) -+PA_PATCH_LIST_TERMINATOR(30) -+PA_PATCH_LIST_TERMINATOR(31) -+PA_PATCH_LIST_TERMINATOR(32) -+#undef PA_PATCH_LIST_TERMINATOR -+ -+bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaTriList1, PaTriListSingle0); -+ return false; // Not enough vertices to assemble 4 or 8 triangles. -+} -+ -+bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaTriList2, PaTriListSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. 
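For reference, the pattern PaTriList2 and PaTriListSingle0 implement with blends, permutes and lane swizzles reduces to a trivial index mapping: triangle i reads vertices 3i, 3i+1 and 3i+2 out of the three buffered simdvectors. A scalar sketch (TriListSource is a hypothetical name):

```cpp
#include <cstdint>

// Scalar form of the TRIANGLE_LIST pattern used by PaTriList2/PaTriListSingle0:
// triangle i uses vertices 3i, 3i+1, 3i+2, which live in one of the three
// buffered simdvectors (a = verts 0-7, b = 8-15, c = 16-23).
void TriListSource(uint32_t prim, uint32_t whichVector[3], uint32_t lane[3])
{
    for (uint32_t v = 0; v < 3; ++v)
    {
        uint32_t vertex = prim * 3 + v;   // 0..23 for an 8-triangle batch
        whichVector[v]  = vertex / 8;     // 0 = a, 1 = b, 2 = c
        lane[v]         = vertex % 8;     // lane within that simdvector
    }
}
```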
-+} -+ -+bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ simdvector& c = PaGetSimdVector(pa, 2, slot); -+ simdscalar s; -+ -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 0 3 6 9 12 15 18 21 -+ // v1 -> 1 4 7 10 13 16 19 22 -+ // v2 -> 2 5 8 11 14 17 20 23 -+ -+ for(int i = 0; i < 4; ++i) -+ { -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_blend_ps(a[i], b[i], 0x92); -+ v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); -+ v0[i] = _mm256_permute_ps(v0[i], 0x6C); -+ s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21); -+ v0[i] = _simd_blend_ps(v0[i], s, 0x44); -+ -+ simdvector& v1 = verts[1]; -+ v1[i] = _simd_blend_ps(a[i], b[i], 0x24); -+ v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); -+ v1[i] = _mm256_permute_ps(v1[i], 0xB1); -+ s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21); -+ v1[i] = _simd_blend_ps(v1[i], s, 0x66); -+ -+ simdvector& v2 = verts[2]; -+ v2[i] = _simd_blend_ps(a[i], b[i], 0x49); -+ v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); -+ v2[i] = _mm256_permute_ps(v2[i], 0xC6); -+ s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21); -+ v2[i] = _simd_blend_ps(v2[i], s, 0x22); -+ } -+ -+ SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ // We have 12 simdscalars contained within 3 simdvectors which -+ // hold at least 8 triangles worth of data. We want to assemble a single -+ // triangle with data in horizontal form. -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ simdvector& c = PaGetSimdVector(pa, 2, slot); -+ -+ // Convert from vertical to horizontal. -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 0 3 6 9 12 15 18 21 -+ // v1 -> 1 4 7 10 13 16 19 22 -+ // v2 -> 2 5 8 11 14 17 20 23 -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane3(a); -+ verts[1] = swizzleLane4(a); -+ verts[2] = swizzleLane5(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ verts[2] = swizzleLane0(b); -+ break; -+ case 3: -+ verts[0] = swizzleLane1(b); -+ verts[1] = swizzleLane2(b); -+ verts[2] = swizzleLane3(b); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane5(b); -+ verts[2] = swizzleLane6(b); -+ break; -+ case 5: -+ verts[0] = swizzleLane7(b); -+ verts[1] = swizzleLane0(c); -+ verts[2] = swizzleLane1(c); -+ break; -+ case 6: -+ verts[0] = swizzleLane2(c); -+ verts[1] = swizzleLane3(c); -+ verts[2] = swizzleLane4(c); -+ break; -+ case 7: -+ verts[0] = swizzleLane5(c); -+ verts[1] = swizzleLane6(c); -+ verts[2] = swizzleLane7(c); -+ break; -+ }; -+} -+ -+bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. 
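The strip pattern documented below (v0 → 01234567, v1 → 13355779, v2 → 22446688) is the usual alternating-winding rule; in scalar form (TriStripIndices is a hypothetical name):

```cpp
#include <cstdint>

// Scalar form of the TRIANGLE_STRIP pattern used by PaTriStrip1/PaTriStripSingle0:
// triangle i uses vertices (i, i+1, i+2), with the last two swapped on odd
// triangles so that every triangle keeps the same facing.
void TriStripIndices(uint32_t prim, uint32_t idx[3])
{
    idx[0] = prim;                     // provoking vertex: 0 1 2 3 4 5 6 7
    idx[1] = prim + 1 + (prim & 1);    // 1 3 3 5 5 7 7 9
    idx[2] = prim + 2 - (prim & 1);    // 2 2 4 4 6 6 8 8
}
```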
-+} -+ -+bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ simdscalar s; -+ -+ for(int i = 0; i < 4; ++i) -+ { -+ simdscalar a0 = a[i]; -+ simdscalar b0 = b[i]; -+ -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 01234567 -+ // v1 -> 13355779 -+ // v2 -> 22446688 -+ simdvector& v0 = verts[0]; -+ v0[i] = a0; -+ -+ // s -> 4567891011 -+ s = _mm256_permute2f128_ps(a0, b0, 0x21); -+ // s -> 23456789 -+ s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); -+ -+ simdvector& v1 = verts[1]; -+ // v1 -> 13355779 -+ v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1)); -+ -+ simdvector& v2 = verts[2]; -+ // v2 -> 22446688 -+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); -+ } -+ -+ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ // Convert from vertical to horizontal. -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 01234567 -+ // v1 -> 13355779 -+ // v2 -> 22446688 -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane1(a); -+ verts[1] = swizzleLane3(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane2(a); -+ verts[1] = swizzleLane3(a); -+ verts[2] = swizzleLane4(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane3(a); -+ verts[1] = swizzleLane5(a); -+ verts[2] = swizzleLane4(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ verts[2] = swizzleLane6(a); -+ break; -+ case 5: -+ verts[0] = swizzleLane5(a); -+ verts[1] = swizzleLane7(a); -+ verts[2] = swizzleLane6(a); -+ break; -+ case 6: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ verts[2] = swizzleLane0(b); -+ break; -+ case 7: -+ verts[0] = swizzleLane7(a); -+ verts[1] = swizzleLane1(b); -+ verts[2] = swizzleLane0(b); -+ break; -+ }; -+} -+ -+bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ // Extract vertex 0 to every lane of first vector -+ for(int i = 0; i < 4; ++i) -+ { -+ __m256 a0 = a[i]; -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0)); -+ v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00); -+ } -+ -+ // store off leading vertex for attributes -+ simdvertex* pVertex = (simdvertex*)pa.pStreamBase; -+ pa.leadingVertex = pVertex[pa.cur]; -+ -+ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. -+} -+ -+bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& leadVert = pa.leadingVertex.attrib[slot]; -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ simdscalar s; -+ -+ // need to fill vectors 1/2 with new verts, and v0 with anchor vert. 
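In scalar terms the fan mapping implemented here and in PaTriFanSingle0 is: every triangle shares the anchor vertex captured by PaTriFan0, and the other two vertices advance by one each triangle. A sketch, with indices taken relative to the start of the fan (TriFanIndices is a hypothetical name):

```cpp
#include <cstdint>

// Scalar form of the TRIANGLE_FAN pattern: every triangle shares the leading
// (anchor) vertex stored in pa.leadingVertex; the remaining two vertices
// advance along the fan, one new vertex per triangle.
void TriFanIndices(uint32_t prim, uint32_t idx[3])
{
    idx[0] = 0;         // anchor vertex (pa.leadingVertex)
    idx[1] = prim + 1;
    idx[2] = prim + 2;
}
```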
-+ for(int i = 0; i < 4; ++i) -+ { -+ simdscalar a0 = a[i]; -+ simdscalar b0 = b[i]; -+ -+ __m256 comp = leadVert[i]; -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0)); -+ v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00); -+ -+ simdvector& v2 = verts[2]; -+ s = _mm256_permute2f128_ps(a0, b0, 0x21); -+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); -+ -+ simdvector& v1 = verts[1]; -+ v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); -+ } -+ -+ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ // vert 0 from leading vertex -+ simdvector& lead = pa.leadingVertex.attrib[slot]; -+ verts[0] = swizzleLane0(lead); -+ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ // vert 1 -+ if (primIndex < 7) -+ { -+ verts[1] = swizzleLaneN(a, primIndex + 1); -+ } -+ else -+ { -+ verts[1] = swizzleLane0(b); -+ } -+ -+ // vert 2 -+ if (primIndex < 6) -+ { -+ verts[2] = swizzleLaneN(a, primIndex + 2); -+ } -+ else -+ { -+ verts[2] = swizzleLaneN(b, primIndex - 6); -+ } -+} -+ -+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaQuadList1, PaQuadListSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. -+} -+ -+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ simdscalar s1, s2; -+ -+ for(int i = 0; i < 4; ++i) -+ { -+ simdscalar a0 = a[i]; -+ simdscalar b0 = b[i]; -+ -+ s1 = _mm256_permute2f128_ps(a0, b0, 0x20); -+ s2 = _mm256_permute2f128_ps(a0, b0, 0x31); -+ -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0)); -+ -+ simdvector& v1 = verts[1]; -+ v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1)); -+ -+ simdvector& v2 = verts[2]; -+ v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); -+ } -+ -+ SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ -+ switch (primIndex) -+ { -+ case 0: -+ // triangle 0 - 0 1 2 -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ -+ case 1: -+ // triangle 1 - 0 2 3 -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane2(a); -+ verts[2] = swizzleLane3(a); -+ break; -+ -+ case 2: -+ // triangle 2 - 4 5 6 -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ verts[2] = swizzleLane6(a); -+ break; -+ -+ case 3: -+ // triangle 3 - 4 6 7 -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane6(a); -+ verts[2] = swizzleLane7(a); -+ break; -+ -+ case 4: -+ // triangle 4 - 8 9 10 (0 1 2) -+ verts[0] = swizzleLane0(b); -+ verts[1] = swizzleLane1(b); -+ verts[2] = swizzleLane2(b); -+ break; -+ -+ case 5: -+ // triangle 1 - 0 2 3 -+ verts[0] = swizzleLane0(b); -+ verts[1] = swizzleLane2(b); -+ verts[2] = swizzleLane3(b); -+ break; -+ -+ case 6: -+ // triangle 2 - 4 5 6 -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane5(b); -+ verts[2] = swizzleLane6(b); -+ break; -+ -+ case 7: -+ // triangle 3 - 4 6 7 -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane6(b); -+ 
verts[2] = swizzleLane7(b); -+ break; -+ } -+} -+ -+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[]) -+{ -+ PaLineStripSingle0(pa, slot, lineIndex, verts); -+ -+ if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) { -+ simdvector &start = PaGetSimdVector(pa, pa.first, slot); -+ verts[1] = swizzleLane0(start); -+ } -+} -+ -+bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0); -+ return false; -+} -+ -+bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ PaLineStrip1(pa, slot, verts); -+ -+ if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) { -+ // loop reconnect now -+ int lane = pa.numPrims - pa.numPrimsComplete - 1; -+ simdvector &start = PaGetSimdVector(pa, pa.first, slot); -+ for (int i = 0; i < 4; i++) { -+ float *startVtx = (float *)&(start[i]); -+ float *targetVtx = (float *)&(verts[1][i]); -+ targetVtx[lane] = startVtx[0]; -+ } -+ } -+ -+ SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+ -+bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaLineList1, PaLineListSingle0); -+ return false; // Not enough vertices to assemble 8 lines -+} -+ -+bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ /// @todo: verify provoking vertex is correct -+ // Line list 0 1 2 3 4 5 6 7 -+ // 8 9 10 11 12 13 14 15 -+ -+ // shuffle: -+ // 0 2 4 6 8 10 12 14 -+ // 1 3 5 7 9 11 13 15 -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ // 0 1 2 3 8 9 10 11 -+ __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20); -+ // 4 5 6 7 12 13 14 15 -+ __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31); -+ -+ // 0 2 4 6 8 10 12 14 -+ verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0)); -+ // 1 3 5 7 9 11 13 15 -+ verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1)); -+ } -+ -+ SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector &a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector &b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch (primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane2(a); -+ verts[1] = swizzleLane3(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane0(b); -+ verts[1] = swizzleLane1(b); -+ break; -+ case 5: -+ verts[0] = swizzleLane2(b); -+ verts[1] = swizzleLane3(b); -+ break; -+ case 6: -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane5(b); -+ break; -+ case 7: -+ verts[0] = swizzleLane6(b); -+ verts[1] = swizzleLane7(b); -+ break; -+ } -+} -+ -+bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); -+ return false; // Not enough vertices to assemble 8 lines -+} -+ -+bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ /// @todo: 
verify provoking vertex is correct -+ // Line list 0 1 2 3 4 5 6 7 -+ // 8 9 10 11 12 13 14 15 -+ -+ // shuffle: -+ // 0 1 2 3 4 5 6 7 -+ // 1 2 3 4 5 6 7 8 -+ -+ verts[0] = a; -+ -+ for(uint32_t i = 0; i < 4; ++i) -+ { -+ // 1 2 3 x 5 6 7 x -+ __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1) -+ // 4 5 6 7 8 9 10 11 -+ __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21); -+ -+ // x x x 4 x x x 8 -+ __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0) -+ -+ verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88); -+ } -+ -+ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch (lineIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane1(a); -+ verts[1] = swizzleLane2(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane2(a); -+ verts[1] = swizzleLane3(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane3(a); -+ verts[1] = swizzleLane4(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ break; -+ case 5: -+ verts[0] = swizzleLane5(a); -+ verts[1] = swizzleLane6(a); -+ break; -+ case 6: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ break; -+ case 7: -+ verts[0] = swizzleLane7(a); -+ verts[1] = swizzleLane0(b); -+ break; -+ } -+} -+ -+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ verts[0] = a; // points only have 1 vertex. 
-+ -+ SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector &a = PaGetSimdVector(pa, pa.cur, slot); -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane1(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane2(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane3(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(a); -+ break; -+ case 5: -+ verts[0] = swizzleLane5(a); -+ break; -+ case 6: -+ verts[0] = swizzleLane6(a); -+ break; -+ case 7: -+ verts[0] = swizzleLane7(a); -+ break; -+ } -+} -+ -+// each point generates two tris -+// primitive assembly broadcasts each point to the 3 vertices of the 2 tris -+// binner will bloat each point -+// -+// input simd : p0 p1 p2 p3 p4 p5 p6 p7 == 8 points, 16 tris -+// output phase 0: -+// verts[0] : p0 p0 p1 p1 p2 p2 p3 p3 -+// verts[1] : p0 p0 p1 p1 p2 p2 p3 p3 -+// verts[2] : p0 p0 p1 p1 p2 p2 p3 p3 -+// -+// output phase 1: -+// verts[0] : p4 p4 p5 p5 p6 p6 p7 p7 -+// verts[1] : p4 p4 p5 p5 p6 p6 p7 p7 -+// verts[2] : p4 p4 p5 p5 p6 p6 p7 p7 -+ -+ -+// 0 1 2 3 4 5 6 7 -+ -+bool PaTriPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]); // 0 0 1 1 4 4 5 5 -+ __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]); // 2 2 3 3 6 6 7 7 -+ __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x20); // 0 0 1 1 2 2 3 3 -+ -+ verts[0].v[i] = verts[1].v[i] = verts[2].v[i] = vCombined; -+ } -+ -+ SetNextPaState(pa, PaTriPoints1, PaTriPointsSingle0, 1, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+bool PaTriPoints1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]); // 0 0 1 1 4 4 5 5 -+ __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]); // 2 2 3 3 6 6 7 7 -+ __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x31); // 4 4 5 5 6 6 7 7 -+ -+ verts[0].v[i] = verts[1].v[i] = verts[2].v[i] = vCombined; -+ } -+ -+ SetNextPaState(pa, PaTriPoints0, PaTriPointsSingle1, 0, KNOB_SIMD_WIDTH); -+ return true; -+ -+} -+ -+static void PaTriPointsSprite(PA_STATE_OPT& pa, uint32_t primIndex, __m128 verts[]) -+{ -+ const API_STATE& state = GetApiState(pa.pDC); -+ -+ if (!state.rastState.pointSpriteTopOrigin) { -+ if (primIndex & 1) { -+ verts[0] = _mm_set_ps(1, 0, 1, 0); -+ verts[1] = _mm_set_ps(1, 0, 0, 1); -+ verts[2] = _mm_set_ps(1, 0, 1, 1); -+ } else { -+ verts[0] = _mm_set_ps(1, 0, 1, 0); -+ verts[1] = _mm_set_ps(1, 0, 0, 0); -+ verts[2] = _mm_set_ps(1, 0, 0, 1); -+ } -+ } else { -+ if (primIndex & 1) { -+ verts[0] = _mm_set_ps(1, 0, 0, 0); -+ verts[1] = _mm_set_ps(1, 0, 1, 1); -+ verts[2] = _mm_set_ps(1, 0, 0, 1); -+ } else { -+ verts[0] = _mm_set_ps(1, 0, 0, 0); -+ verts[1] = _mm_set_ps(1, 0, 1, 0); -+ verts[2] = _mm_set_ps(1, 0, 1, 1); -+ } -+ } -+} -+ -+void PaTriPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ const API_STATE& state = GetApiState(pa.pDC); -+ -+ if (state.rastState.pointSpriteEnable && state.rastState.pointSpriteFESlot == slot) { -+ return PaTriPointsSprite(pa, primIndex, verts); -+ } -+ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch(primIndex) -+ { 
-+ case 0: -+ case 1: -+ verts[0] = verts[1] = verts[2] = swizzleLane0(a); break; -+ case 2: -+ case 3: -+ verts[0] = verts[1] = verts[2] = swizzleLane1(a); break; -+ case 4: -+ case 5: -+ verts[0] = verts[1] = verts[2] = swizzleLane2(a); break; -+ case 6: -+ case 7: -+ verts[0] = verts[1] = verts[2] = swizzleLane3(a); break; -+ } -+} -+ -+void PaTriPointsSingle1(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ const API_STATE& state = GetApiState(pa.pDC); -+ -+ if (state.rastState.pointSpriteEnable && state.rastState.pointSpriteFESlot == slot) { -+ return PaTriPointsSprite(pa, primIndex, verts); -+ } -+ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch(primIndex) -+ { -+ case 0: -+ case 1: -+ verts[0] = verts[1] = verts[2] = swizzleLane4(a); break; -+ case 2: -+ case 3: -+ verts[0] = verts[1] = verts[2] = swizzleLane5(a); break; -+ case 4: -+ case 5: -+ verts[0] = verts[1] = verts[2] = swizzleLane6(a); break; -+ case 6: -+ case 7: -+ verts[0] = verts[1] = verts[2] = swizzleLane7(a); break; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief State 1 for RECT_LIST topology. -+/// There is not enough to assemble 8 triangles. -+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaRectList1, PaRectListSingle0); -+ return false; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief State 1 for RECT_LIST topology. -+/// Rect lists has the following format. -+/// w x y z -+/// v2 o---o v5 o---o v8 o---o v11 o---o -+/// | \ | | \ | | \ | | \ | -+/// v1 o---o v4 o---o v7 o---o v10 o---o -+/// v0 v3 v6 v9 -+/// -+/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. -+/// -+/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 -+/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 -+/// etc. -+/// -+/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 -+/// where v0 contains all the first vertices for 8 triangles. -+/// -+/// Result: -+/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } -+/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } -+/// verts[2] = { v2, w, v5, x, v8, y, v11, z } -+/// -+/// @param pa - State for PA state machine. -+/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -+bool PaRectList1( -+ PA_STATE_OPT& pa, -+ uint32_t slot, -+ simdvector verts[]) -+{ -+ // SIMD vectors a and b are the last two vertical outputs from the vertex shader. -+ simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } -+ simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } -+ -+ __m256 tmp0, tmp1, tmp2; -+ -+ // Loop over each component in the simdvector. -+ for(int i = 0; i < 4; ++i) -+ { -+ simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } -+ tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } -+ v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. 
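The only arithmetic in this routine is the implied fourth rectangle corner computed a few lines further down, w = v0 - v1 + v2, applied per component. In scalar form (Vec4 and ImpliedRectCorner are hypothetical names):

```cpp
// Scalar form of the implied rectangle corner used by PaRectList1:
// given the three supplied corners v0, v1, v2 of a rect, the missing
// corner is w = v0 - v1 + v2, computed per position/attribute component.
struct Vec4 { float x, y, z, w; };

Vec4 ImpliedRectCorner(const Vec4& v0, const Vec4& v1, const Vec4& v2)
{
    return { v0.x - v1.x + v2.x,
             v0.y - v1.y + v2.y,
             v0.z - v1.z + v2.z,
             v0.w - v1.w + v2.w };
}
```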
-+ tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } -+ v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } -+ v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } -+ -+ /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. -+ /// AVX2 should make this much cheaper. -+ simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } -+ v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } -+ tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } -+ tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } -+ tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } -+ v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } -+ v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } -+ v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } -+ -+ // verts[2] = { v2, w, v5, x, v8, y, v11, z } -+ simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } -+ v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } -+ tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } -+ v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); -+ -+ // Need to compute 4th implied vertex for the rectangle. -+ tmp2 = _mm256_sub_ps(v0[i], v1[i]); -+ tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } -+ tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } -+ v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } -+ } -+ -+ SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief State 2 for RECT_LIST topology. -+/// Not implemented unless there is a use case for more then 8 rects. -+/// @param pa - State for PA state machine. -+/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -+bool PaRectList2( -+ PA_STATE_OPT& pa, -+ uint32_t slot, -+ simdvector verts[]) -+{ -+ SWR_ASSERT(0); // Is rect list used for anything other then clears? -+ SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief This procedure is called by the Binner to assemble the attributes. -+/// Unlike position, which is stored vertically, the attributes are -+/// stored horizontally. The outputs from the VS, labeled as 'a' and -+/// 'b' are vertical. This function needs to transpose the lanes -+/// containing the vertical attribute data into horizontal form. -+/// @param pa - State for PA state machine. -+/// @param slot - Index into VS output for a given attribute. -+/// @param primIndex - Binner processes each triangle individually. -+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -+void PaRectListSingle0( -+ PA_STATE_OPT& pa, -+ uint32_t slot, -+ uint32_t primIndex, -+ __m128 verts[]) -+{ -+ // We have 12 simdscalars contained within 3 simdvectors which -+ // hold at least 8 triangles worth of data. 
We want to assemble a single -+ // triangle with data in horizontal form. -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ -+ // Convert from vertical to horizontal. -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane2(a); -+ verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2); -+ break; -+ case 2: -+ case 3: -+ case 4: -+ case 5: -+ case 6: -+ case 7: -+ SWR_ASSERT(0); -+ break; -+ }; -+} -+ -+PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, -+ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), -+ cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ -+ this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; -+ -+ switch (this->binTopology) -+ { -+ case TOP_TRIANGLE_LIST: -+ this->pfnPaFunc = PaTriList0; -+ break; -+ case TOP_TRIANGLE_STRIP: -+ this->pfnPaFunc = PaTriStrip0; -+ break; -+ case TOP_TRIANGLE_FAN: -+ this->pfnPaFunc = PaTriFan0; -+ break; -+ case TOP_QUAD_LIST: -+ this->pfnPaFunc = PaQuadList0; -+ this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles -+ break; -+ case TOP_QUAD_STRIP: -+ // quad strip pattern when decomposed into triangles is the same as verts strips -+ this->pfnPaFunc = PaTriStrip0; -+ this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles -+ break; -+ case TOP_LINE_LIST: -+ this->pfnPaFunc = PaLineList0; -+ this->numPrims = in_numPrims; -+ break; -+ case TOP_LINE_STRIP: -+ this->pfnPaFunc = PaLineStrip0; -+ this->numPrims = in_numPrims; -+ break; -+ case TOP_LINE_LOOP: -+ this->pfnPaFunc = PaLineLoop0; -+ this->numPrims = in_numPrims; -+ break; -+ case TOP_POINT_LIST: -+ // use point binner and rasterizer if supported -+ if (CanUseSimplePoints(pDC)) -+ { -+ this->pfnPaFunc = PaPoints0; -+ this->numPrims = in_numPrims; -+ } -+ else -+ { -+ this->pfnPaFunc = PaTriPoints0; -+ this->numPrims = in_numPrims * 2; // 1 point generates 2 tris -+ } -+ break; -+ case TOP_RECT_LIST: -+ this->pfnPaFunc = PaRectList0; -+ this->numPrims = in_numPrims * 2; -+ break; -+ -+ case TOP_PATCHLIST_1: -+ this->pfnPaFunc = PaPatchList<1>; -+ break; -+ case TOP_PATCHLIST_2: -+ this->pfnPaFunc = PaPatchList<2>; -+ break; -+ case TOP_PATCHLIST_3: -+ this->pfnPaFunc = PaPatchList<3>; -+ break; -+ case TOP_PATCHLIST_4: -+ this->pfnPaFunc = PaPatchList<4>; -+ break; -+ case TOP_PATCHLIST_5: -+ this->pfnPaFunc = PaPatchList<5>; -+ break; -+ case TOP_PATCHLIST_6: -+ this->pfnPaFunc = PaPatchList<6>; -+ break; -+ case TOP_PATCHLIST_7: -+ this->pfnPaFunc = PaPatchList<7>; -+ break; -+ case TOP_PATCHLIST_8: -+ this->pfnPaFunc = PaPatchList<8>; -+ break; -+ case TOP_PATCHLIST_9: -+ this->pfnPaFunc = PaPatchList<9>; -+ break; -+ case TOP_PATCHLIST_10: -+ this->pfnPaFunc = PaPatchList<10>; -+ break; -+ case TOP_PATCHLIST_11: -+ this->pfnPaFunc = PaPatchList<11>; -+ break; -+ case TOP_PATCHLIST_12: -+ this->pfnPaFunc = PaPatchList<12>; -+ break; -+ case TOP_PATCHLIST_13: -+ this->pfnPaFunc = PaPatchList<13>; -+ break; -+ case TOP_PATCHLIST_14: -+ this->pfnPaFunc = PaPatchList<14>; -+ break; -+ case TOP_PATCHLIST_15: -+ this->pfnPaFunc = PaPatchList<15>; -+ break; -+ case TOP_PATCHLIST_16: -+ this->pfnPaFunc = 
PaPatchList<16>; -+ break; -+ case TOP_PATCHLIST_17: -+ this->pfnPaFunc = PaPatchList<17>; -+ break; -+ case TOP_PATCHLIST_18: -+ this->pfnPaFunc = PaPatchList<18>; -+ break; -+ case TOP_PATCHLIST_19: -+ this->pfnPaFunc = PaPatchList<19>; -+ break; -+ case TOP_PATCHLIST_20: -+ this->pfnPaFunc = PaPatchList<20>; -+ break; -+ case TOP_PATCHLIST_21: -+ this->pfnPaFunc = PaPatchList<21>; -+ break; -+ case TOP_PATCHLIST_22: -+ this->pfnPaFunc = PaPatchList<22>; -+ break; -+ case TOP_PATCHLIST_23: -+ this->pfnPaFunc = PaPatchList<23>; -+ break; -+ case TOP_PATCHLIST_24: -+ this->pfnPaFunc = PaPatchList<24>; -+ break; -+ case TOP_PATCHLIST_25: -+ this->pfnPaFunc = PaPatchList<25>; -+ break; -+ case TOP_PATCHLIST_26: -+ this->pfnPaFunc = PaPatchList<26>; -+ break; -+ case TOP_PATCHLIST_27: -+ this->pfnPaFunc = PaPatchList<27>; -+ break; -+ case TOP_PATCHLIST_28: -+ this->pfnPaFunc = PaPatchList<28>; -+ break; -+ case TOP_PATCHLIST_29: -+ this->pfnPaFunc = PaPatchList<29>; -+ break; -+ case TOP_PATCHLIST_30: -+ this->pfnPaFunc = PaPatchList<30>; -+ break; -+ case TOP_PATCHLIST_31: -+ this->pfnPaFunc = PaPatchList<31>; -+ break; -+ case TOP_PATCHLIST_32: -+ this->pfnPaFunc = PaPatchList<32>; -+ break; -+ -+ default: -+ SWR_ASSERT(0); -+ break; -+ }; -+ -+ // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); -+ // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3); -+ simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); -+ -+ switch(this->binTopology) -+ { -+ case TOP_TRIANGLE_LIST: -+ case TOP_TRIANGLE_STRIP: -+ case TOP_TRIANGLE_FAN: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LIST: -+ case TOP_LINE_LOOP: -+ this->primIDIncr = 8; -+ this->primID = id8; -+ break; -+ case TOP_QUAD_LIST: -+ case TOP_QUAD_STRIP: -+ case TOP_RECT_LIST: -+ this->primIDIncr = 4; -+ this->primID = id4; -+ break; -+ case TOP_POINT_LIST: -+ if (CanUseSimplePoints(pDC)) -+ { -+ this->primIDIncr = 8; -+ this->primID = id8; -+ } -+ else -+ { -+ this->primIDIncr = 4; -+ this->primID = id4; -+ } -+ break; -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ // Always run KNOB_SIMD_WIDTH number of patches at a time. -+ this->primIDIncr = 8; -+ this->primID = id8; -+ break; -+ -+ default: -+ SWR_ASSERT(0); -+ break; -+ }; -+ -+} -+#endif -diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp -new file mode 100644 -index 0000000..71de298 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp -@@ -0,0 +1,1217 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rasterizer.cpp -+* -+* @brief Implementation for the rasterizer. -+* -+******************************************************************************/ -+ -+#include -+#include -+ -+#include "rasterizer.h" -+#include "multisample.h" -+#include "rdtsc_core.h" -+#include "backend.h" -+#include "utils.h" -+#include "frontend.h" -+#include "tilemgr.h" -+#include "memory/tilingtraits.h" -+ -+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, -+ uint32_t numSamples, uint32_t renderTargetArrayIndex); -+void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep); -+void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, -+ uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep); -+ -+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} -+const __m128 gMaskToVec[] = { -+ MASKTOVEC(0,0,0,0), -+ MASKTOVEC(0,0,0,1), -+ MASKTOVEC(0,0,1,0), -+ MASKTOVEC(0,0,1,1), -+ MASKTOVEC(0,1,0,0), -+ MASKTOVEC(0,1,0,1), -+ MASKTOVEC(0,1,1,0), -+ MASKTOVEC(0,1,1,1), -+ MASKTOVEC(1,0,0,0), -+ MASKTOVEC(1,0,0,1), -+ MASKTOVEC(1,0,1,0), -+ MASKTOVEC(1,0,1,1), -+ MASKTOVEC(1,1,0,0), -+ MASKTOVEC(1,1,0,1), -+ MASKTOVEC(1,1,1,0), -+ MASKTOVEC(1,1,1,1), -+}; -+ -+const __m256d gMaskToVecpd[] = -+{ -+ MASKTOVEC(0, 0, 0, 0), -+ MASKTOVEC(0, 0, 0, 1), -+ MASKTOVEC(0, 0, 1, 0), -+ MASKTOVEC(0, 0, 1, 1), -+ MASKTOVEC(0, 1, 0, 0), -+ MASKTOVEC(0, 1, 0, 1), -+ MASKTOVEC(0, 1, 1, 0), -+ MASKTOVEC(0, 1, 1, 1), -+ MASKTOVEC(1, 0, 0, 0), -+ MASKTOVEC(1, 0, 0, 1), -+ MASKTOVEC(1, 0, 1, 0), -+ MASKTOVEC(1, 0, 1, 1), -+ MASKTOVEC(1, 1, 0, 0), -+ MASKTOVEC(1, 1, 0, 1), -+ MASKTOVEC(1, 1, 1, 0), -+ MASKTOVEC(1, 1, 1, 1), -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief rasterize a raster tile partially covered by the triangle -+/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile -+/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) -+/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. -+/// Used to step between quads when sweeping over the raster tile. 
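-+/// @return 64-bit coverage mask for the raster tile: each 2x2 pixel quad contributes
-+///         four bits, packed at the bit offsets used by the sweep below.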
-+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, __m256d vEdge0, __m256d vEdge1, __m256d vEdge2, -+ __m128i &vA, __m128i &vB, __m256d &vStepQuad0, __m256d &vStepQuad1, __m256d &vStepQuad2) -+{ -+ uint64_t coverageMask = 0; -+ -+ // Step to the pixel sample locations of the 1st quad -+ double edge0; -+ double edge1; -+ double edge2; -+ _mm_store_sd(&edge0, _mm256_castpd256_pd128(vEdge0)); -+ _mm_store_sd(&edge1, _mm256_castpd256_pd128(vEdge1)); -+ _mm_store_sd(&edge2, _mm256_castpd256_pd128(vEdge2)); -+ -+ vEdge0 = _mm256_broadcast_sd(&edge0); -+ vEdge1 = _mm256_broadcast_sd(&edge1); -+ vEdge2 = _mm256_broadcast_sd(&edge2); -+ -+ vEdge0 = _mm256_add_pd(vEdge0, vStepQuad0); -+ vEdge1 = _mm256_add_pd(vEdge1, vStepQuad1); -+ vEdge2 = _mm256_add_pd(vEdge2, vStepQuad2); -+ -+ // compute step to next quad (mul by 2 in x and y direction) -+ __m256d vAEdge0 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 0, 0, 0))); -+ __m256d vAEdge1 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(1, 1, 1, 1))); -+ __m256d vAEdge2 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(2, 2, 2, 2))); -+ __m256d vBEdge0 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 0, 0, 0))); -+ __m256d vBEdge1 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(1, 1, 1, 1))); -+ __m256d vBEdge2 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(2, 2, 2, 2))); -+ -+ __m256d vStep0X = _mm256_mul_pd(vAEdge0, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ __m256d vStep0Y = _mm256_mul_pd(vBEdge0, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ -+ __m256d vStep1X = _mm256_mul_pd(vAEdge1, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ __m256d vStep1Y = _mm256_mul_pd(vBEdge1, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ -+ __m256d vStep2X = _mm256_mul_pd(vAEdge2, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ __m256d vStep2Y = _mm256_mul_pd(vBEdge2, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ -+ // fast unrolled version for 8x8 tile -+#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 -+ int mask0, mask1, mask2; -+ uint64_t mask; -+ -+ // evaluate which pixels in the quad are covered -+#define EVAL \ -+ mask0 = _mm256_movemask_pd(vEdge0);\ -+ mask1 = _mm256_movemask_pd(vEdge1);\ -+ mask2 = _mm256_movemask_pd(vEdge2); -+ -+ // update coverage mask -+#define UPDATE_MASK(bit) \ -+ mask = mask0 & mask1 & mask2;\ -+ coverageMask |= (mask << bit); -+ -+ // step in the +x direction to the next quad -+#define INCX \ -+ vEdge0 = _mm256_add_pd(vEdge0, vStep0X);\ -+ vEdge1 = _mm256_add_pd(vEdge1, vStep1X);\ -+ vEdge2 = _mm256_add_pd(vEdge2, vStep2X); -+ // step in the +y direction to the next quad -+#define INCY \ -+ vEdge0 = _mm256_add_pd(vEdge0, vStep0Y);\ -+ vEdge1 = _mm256_add_pd(vEdge1, vStep1Y);\ -+ vEdge2 = _mm256_add_pd(vEdge2, vStep2Y); -+ // step in the -x direction to the next quad -+#define DECX \ -+ vEdge0 = _mm256_sub_pd(vEdge0, vStep0X);\ -+ vEdge1 = _mm256_sub_pd(vEdge1, vStep1X);\ -+ vEdge2 = _mm256_sub_pd(vEdge2, vStep2X); -+ -+ // sweep 2x2 quad back and forth through the raster tile, -+ // computing coverage masks for the entire tile -+ -+ // raster tile -+ // 0 1 2 3 4 5 6 7 -+ // x x -+ // x x ------------------> -+ // x x | -+ // <-----------------x x V -+ // .. 
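-+    // In this unrolled 8x8 path the 2x2 quad is swept in a serpentine order, so each
-+    // move costs one add per edge equation (INCX / INCY / DECX). Every EVAL produces a
-+    // 4-bit quad mask (one bit per pixel), and UPDATE_MASK ORs it into the 64-bit
-+    // coverage mask at that quad's offset: row 0 fills bits 0,4,8,12; row 1 is walked
-+    // right-to-left and fills 28,24,20,16; rows 2 and 3 repeat the pattern.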
-+ -+ // row 0 -+ EVAL; -+ UPDATE_MASK(0); -+ INCX; -+ EVAL; -+ UPDATE_MASK(4); -+ INCX; -+ EVAL; -+ UPDATE_MASK(8); -+ INCX; -+ EVAL; -+ UPDATE_MASK(12); -+ INCY; -+ -+ //row 1 -+ EVAL; -+ UPDATE_MASK(28); -+ DECX; -+ EVAL; -+ UPDATE_MASK(24); -+ DECX; -+ EVAL; -+ UPDATE_MASK(20); -+ DECX; -+ EVAL; -+ UPDATE_MASK(16); -+ INCY; -+ -+ // row 2 -+ EVAL; -+ UPDATE_MASK(32); -+ INCX; -+ EVAL; -+ UPDATE_MASK(36); -+ INCX; -+ EVAL; -+ UPDATE_MASK(40); -+ INCX; -+ EVAL; -+ UPDATE_MASK(44); -+ INCY; -+ -+ // row 3 -+ EVAL; -+ UPDATE_MASK(60); -+ DECX; -+ EVAL; -+ UPDATE_MASK(56); -+ DECX; -+ EVAL; -+ UPDATE_MASK(52); -+ DECX; -+ EVAL; -+ UPDATE_MASK(48); -+#else -+ uint32_t bit = 0; -+ for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) -+ { -+ __m256d vStartOfRowEdge0 = vEdge0; -+ __m256d vStartOfRowEdge1 = vEdge1; -+ __m256d vStartOfRowEdge2 = vEdge2; -+ -+ for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) -+ { -+ int mask0 = _mm256_movemask_pd(vEdge0); -+ int mask1 = _mm256_movemask_pd(vEdge1); -+ int mask2 = _mm256_movemask_pd(vEdge2); -+ -+ uint64_t mask = mask0 & mask1 & mask2; -+ coverageMask |= (mask << bit); -+ -+ // step to the next pixel in the x -+ vEdge0 = _mm256_add_pd(vEdge0, vStep0X); -+ vEdge1 = _mm256_add_pd(vEdge1, vStep1X); -+ vEdge2 = _mm256_add_pd(vEdge2, vStep2X); -+ bit+=4; -+ } -+ -+ // step to the next row -+ vEdge0 = _mm256_add_pd(vStartOfRowEdge0, vStep0Y); -+ vEdge1 = _mm256_add_pd(vStartOfRowEdge1, vStep1Y); -+ vEdge2 = _mm256_add_pd(vStartOfRowEdge2, vStep2Y); -+ } -+#endif -+ return coverageMask; -+ -+} -+// Top left rule: -+// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge -+// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge -+// Top left: a sample is in if it is a top or left edge. 
-+// Out: !(horizontal && above) = !horizontal && below -+// Out: !horizontal && left = !(!horizontal && left) = horizontal and right -+INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge) -+{ -+ // if vA < 0, vC-- -+ // if vA == 0 && vB < 0, vC-- -+ -+ __m256d vEdgeOut = vEdge; -+ __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); -+ -+ // if vA < 0 (line is not horizontal and below) -+ int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); -+ -+ // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) -+ __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); -+ int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); -+ msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); -+ -+ // if either of these are true and we're on the line (edge == 0), bump it outside the line -+ vEdgeOut = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); -+ return vEdgeOut; -+} -+ -+// max(abs(dz/dx), abs(dz,dy) -+INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) -+{ -+ /* -+ // evaluate i,j at (0,0) -+ float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; -+ float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; -+ -+ // evaluate i,j at (1,0) -+ float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; -+ float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; -+ -+ // compute dz/dx -+ float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; -+ float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; -+ float dzdx = abs(d10 - d00); -+ -+ // evaluate i,j at (0,1) -+ float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; -+ float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; -+ -+ float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; -+ float dzdy = abs(d01 - d00); -+ */ -+ -+ // optimized version of above -+ float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); -+ float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); -+ -+ return std::max(dzdx, dzdy); -+} -+ -+INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) -+{ -+ if (pState->depthFormat == R24_UNORM_X8_TYPELESS) -+ { -+ return (1.0f / (1 << 24)); -+ } -+ else if (pState->depthFormat == R16_UNORM) -+ { -+ return (1.0f / (1 << 16)); -+ } -+ else -+ { -+ SWR_ASSERT(pState->depthFormat == R32_FLOAT); -+ -+ // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) -+ float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); -+ uint32_t zMaxInt = *(uint32_t*)&zMax; -+ zMaxInt &= 0x7f800000; -+ zMax = *(float*)&zMaxInt; -+ -+ return zMax * (1.0f / (1 << 23)); -+ } -+} -+ -+INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) -+{ -+ if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) -+ { -+ return 0.0f; -+ } -+ -+ float scale = pState->slopeScaledDepthBias; -+ if (scale != 0.0f) -+ { -+ scale *= ComputeMaxDepthSlope(pTri); -+ } -+ -+ float bias = pState->depthBias * ComputeBiasFactor(pState, pTri, z) + scale; -+ if (pState->depthBiasClamp > 0.0f) -+ { -+ bias = std::min(bias, pState->depthBiasClamp); -+ } -+ else if (pState->depthBiasClamp < 0.0f) -+ { -+ bias = std::max(bias, pState->depthBiasClamp); -+ } -+ -+ return bias; -+} -+ -+// Prevent DCE by writing coverage mask from rasterizer to volatile -+#if KNOB_ENABLE_TOSS_POINTS -+__declspec(thread) 
volatile uint64_t gToss; -+#endif -+ -+static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; -+// try to avoid _chkstk insertions; make this thread local -+static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; -+ -+template -+void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) -+{ -+ -+ const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_BIN_TRIS) -+ { -+ return; -+ } -+#endif -+ RDTSC_START(BERasterizeTriangle); -+ -+ RDTSC_START(BETriangleSetup); -+ const API_STATE &state = GetApiState(pDC); -+ const SWR_RASTSTATE &rastState = state.rastState; -+ -+ OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; -+ triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; -+ -+ __m128 vX, vY, vZ, vRecipW; -+ -+ // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care -+ // eg: vX = [x0 x1 x2 dc] -+ vX = _mm_load_ps(workDesc.pTriBuffer); -+ vY = _mm_load_ps(workDesc.pTriBuffer + 4); -+ vZ = _mm_load_ps(workDesc.pTriBuffer + 8); -+ vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); -+ -+ // convert to fixed point -+ __m128i vXi = fpToFixedPoint(vX); -+ __m128i vYi = fpToFixedPoint(vY); -+ -+ // quantize floating point position to fixed point precision -+ // to prevent attribute creep around the triangle vertices -+ vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); -+ vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); -+ -+ // triangle setup - A and B edge equation coefs -+ __m128 vA, vB; -+ triangleSetupAB(vX, vY, vA, vB); -+ -+ __m128i vAi, vBi; -+ triangleSetupABInt(vXi, vYi, vAi, vBi); -+ -+ // determinant -+ float det = calcDeterminantInt(vAi, vBi); -+ -+ /// @todo: This test is flipped...we have a stray '-' sign somewhere -+ // Convert CW triangles to CCW -+ if (det > 0.0) -+ { -+ vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); -+ vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); -+ vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); -+ vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); -+ det = -det; -+ } -+ -+ __m128 vC; -+ // Finish triangle setup - C edge coef -+ triangleSetupC(vX, vY, vA, vB, vC); -+ -+ // compute barycentric i and j -+ // i = (A1x + B1y + C1)/det -+ // j = (A2x + B2y + C2)/det -+ __m128 vDet = _mm_set1_ps(det); -+ __m128 vRecipDet = _mm_div_ps(_mm_set1_ps(1.0f), vDet);//_mm_rcp_ps(vDet); -+ _mm_store_ss(&triDesc.recipDet, vRecipDet); -+ -+ // only extract coefs for 2 of the barycentrics; the 3rd can be -+ // determined from the barycentric equation: -+ // i + j + k = 1 <=> k = 1 - j - i -+ _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); -+ _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); -+ _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); -+ _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); -+ _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); -+ _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); -+ -+ OSALIGN(float, 16) oneOverW[4]; -+ _mm_store_ps(oneOverW, vRecipW); -+ triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; -+ triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; -+ triDesc.OneOverW[2] = oneOverW[2]; -+ -+ // calculate perspective correct coefs per vertex attrib -+ float* pPerspAttribs = perspAttribsTLS; -+ float* pAttribs = workDesc.pAttribs; -+ triDesc.pPerspAttribs = pPerspAttribs; -+ triDesc.pAttribs = pAttribs; -+ float *pRecipW = workDesc.pTriBuffer + 12; -+ __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); -+ __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1); -+ __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1); -+ 
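-+    // Scale each vertex attribute by that vertex's 1/w before storing it into the
-+    // thread-local perspAttribs buffer; this is the usual setup for perspective-correct
-+    // interpolation (the backend is then expected to divide the interpolated result by
-+    // the interpolated 1/w).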
for(uint32_t i = 0; i < workDesc.numAttribs; i++) -+ { -+ __m128 attribA = _mm_load_ps(pAttribs); -+ __m128 attribB = _mm_load_ps(pAttribs+=4); -+ __m128 attribC = _mm_load_ps(pAttribs+=4); -+ pAttribs+=4; -+ -+ attribA = _mm_mul_ps(attribA, vOneOverWV0); -+ attribB = _mm_mul_ps(attribB, vOneOverWV1); -+ attribC = _mm_mul_ps(attribC, vOneOverWV2); -+ -+ _mm_store_ps(pPerspAttribs, attribA); -+ _mm_store_ps(pPerspAttribs+=4, attribB); -+ _mm_store_ps(pPerspAttribs+=4, attribC); -+ pPerspAttribs+=4; -+ } -+ -+ // compute bary Z -+ // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) -+ OSALIGN(float, 16) a[4]; -+ _mm_store_ps(a, vZ); -+ triDesc.Z[0] = a[0] - a[2]; -+ triDesc.Z[1] = a[1] - a[2]; -+ triDesc.Z[2] = a[2]; -+ -+ // add depth bias -+ triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); -+ -+ // broadcast A and B coefs for each edge to all slots -+ __m128i vAEdge0h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(0,0,0,0)); -+ __m128i vAEdge1h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(1,1,1,1)); -+ __m128i vAEdge2h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(2,2,2,2)); -+ __m128i vBEdge0h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(0,0,0,0)); -+ __m128i vBEdge1h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(1,1,1,1)); -+ __m128i vBEdge2h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(2,2,2,2)); -+ -+ __m256d vAEdge0Fix8 = _mm256_cvtepi32_pd(vAEdge0h); -+ __m256d vAEdge1Fix8 = _mm256_cvtepi32_pd(vAEdge1h); -+ __m256d vAEdge2Fix8 = _mm256_cvtepi32_pd(vAEdge2h); -+ __m256d vBEdge0Fix8 = _mm256_cvtepi32_pd(vBEdge0h); -+ __m256d vBEdge1Fix8 = _mm256_cvtepi32_pd(vBEdge1h); -+ __m256d vBEdge2Fix8 = _mm256_cvtepi32_pd(vBEdge2h); -+ -+ // Precompute pixel quad step offsets -+ // 0,0 ------ 1,0 -+ // | | -+ // | | -+ // 1,0 ------ 1,1 -+ const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); -+ const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); -+ -+ // Evaluate edge equations at 4 upper left corners of a 2x2 pixel quad -+ // used to step between quads while sweeping over a raster tile -+ __m256d vQuadStepX0Fix16 = _mm256_mul_pd(vAEdge0Fix8, vQuadOffsetsXIntFix8); -+ __m256d vQuadStepX1Fix16 = _mm256_mul_pd(vAEdge1Fix8, vQuadOffsetsXIntFix8); -+ __m256d vQuadStepX2Fix16 = _mm256_mul_pd(vAEdge2Fix8, vQuadOffsetsXIntFix8); -+ -+ __m256d vQuadStepY0Fix16 = _mm256_mul_pd(vBEdge0Fix8, vQuadOffsetsYIntFix8); -+ __m256d vQuadStepY1Fix16 = _mm256_mul_pd(vBEdge1Fix8, vQuadOffsetsYIntFix8); -+ __m256d vQuadStepY2Fix16 = _mm256_mul_pd(vBEdge2Fix8, vQuadOffsetsYIntFix8); -+ -+ // vStepQuad = A*vQuadOffsetsXInt + B*vQuadOffsetsYInt -+ __m256d vStepQuad0Fix16 = _mm256_add_pd(vQuadStepX0Fix16, vQuadStepY0Fix16); -+ __m256d vStepQuad1Fix16 = _mm256_add_pd(vQuadStepX1Fix16, vQuadStepY1Fix16); -+ __m256d vStepQuad2Fix16 = _mm256_add_pd(vQuadStepX2Fix16, vQuadStepY2Fix16); -+ -+ // Precompute tile step offsets -+ // 0,0 ------ KNOB_TILE_X_DIM-1,0 -+ // | | -+ // | | -+ // KNOB_TILE_Y_DIM-1,0 ------ KNOB_TILE_X_DIM-1,KNOB_TILE_Y_DIM-1 -+ const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM-1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM-1)*FIXED_POINT_SCALE, 0); -+ const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM-1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM-1)*FIXED_POINT_SCALE, 0, 0); -+ -+ // Calc bounding box of triangle -+ OSALIGN(BBOX, 16) bbox; -+ calcBoundingBoxInt(vXi, vYi, bbox); -+ -+ // Intersect with scissor/viewport -+ bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left); -+ bbox.right = 
std::min(bbox.right - 1, state.scissorInFixedPoint.right); -+ bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top); -+ bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); -+ -+ triDesc.triFlags = workDesc.triFlags; -+ -+ // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox -+ uint32_t macroX, macroY; -+ MacroTileMgr::getTileIndices(macroTile, macroX, macroY); -+ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; -+ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; -+ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; -+ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; -+ -+ OSALIGN(BBOX, 16) intersect; -+ intersect.left = std::max(bbox.left, macroBoxLeft); -+ intersect.top = std::max(bbox.top, macroBoxTop); -+ intersect.right = std::min(bbox.right, macroBoxRight); -+ intersect.bottom = std::min(bbox.bottom, macroBoxBottom); -+ -+ SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0); -+ -+ RDTSC_STOP(BETriangleSetup, 0, pDC->drawId); -+ -+ // update triangle desc -+ uint32_t tileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t tileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t numTilesX = maxTileX - tileX + 1; -+ uint32_t numTilesY = maxTileY - tileY + 1; -+ -+ if (numTilesX == 0 || numTilesY == 0) -+ { -+ RDTSC_EVENT(BEEmptyTriangle, 1, 0); -+ RDTSC_STOP(BERasterizeTriangle, 1, 0); -+ return; -+ } -+ -+ RDTSC_START(BEStepSetup); -+ -+ // Step to pixel center of top-left pixel of the triangle bbox -+ // Align intersect bbox (top/left) to raster tile's (top/left). 
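-+    // (AlignDown(v, a) is assumed to round v down to the nearest multiple of a, so x and
-+    //  y land on the fixed-point corner of the containing raster tile.)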
-+ int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); -+ int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); -+ -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // Add 0.5, in fixed point, to offset to pixel center -+ x += (FIXED_POINT_SCALE / 2); -+ y += (FIXED_POINT_SCALE / 2); -+ } -+ -+ __m128i vTopLeftX = _mm_set1_epi32(x); -+ __m128i vTopLeftY = _mm_set1_epi32(y); -+ -+ // evaluate edge equations at top-left pixel using 64bit math -+ // all other evaluations will be 32bit steps from it -+ // small triangles could skip this and do all 32bit math -+ // edge 0 -+ // -+ // line = Ax + By + C -+ // solving for C: -+ // C = -Ax - By -+ // we know x0 and y0 are on the line; plug them in: -+ // C = -Ax0 - By0 -+ // plug C back into line equation: -+ // line = Ax - Bx - Ax0 - Bx1 -+ // line = A(x - x0) + B(y - y0) -+ // line = A(x0+dX) + B(y0+dY) + C = Ax0 + AdX + By0 + BdY + c = AdX + BdY -+ -+ // edge 0 and 1 -+ // edge0 = A0(x - x0) + B0(y - y0) -+ // edge1 = A1(x - x1) + B1(y - y1) -+ __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); -+ __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); -+ -+ __m256d vEdgeFix16[3]; -+ -+ // evaluate A(dx) and B(dY) for all points -+ __m256d vAipd = _mm256_cvtepi32_pd(vAi); -+ __m256d vBipd = _mm256_cvtepi32_pd(vBi); -+ __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); -+ __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); -+ -+ __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); -+ __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); -+ __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); -+ -+ // adjust for top-left rule -+ vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); -+ -+ // broadcast respective edge results to all lanes -+ double* pEdge = (double*)&vEdge; -+ vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); -+ vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); -+ vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); -+ -+ // compute step to the next tile -+ __m256d vNextXTileFix8 = _mm256_set1_pd(KNOB_TILE_X_DIM * FIXED_POINT_SCALE); -+ __m256d vNextYTileFix8 = _mm256_set1_pd(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE); -+ __m256d vTileStepX0Fix16 = _mm256_mul_pd(vAEdge0Fix8, vNextXTileFix8); -+ __m256d vTileStepY0Fix16 = _mm256_mul_pd(vBEdge0Fix8, vNextYTileFix8); -+ __m256d vTileStepX1Fix16 = _mm256_mul_pd(vAEdge1Fix8, vNextXTileFix8); -+ __m256d vTileStepY1Fix16 = _mm256_mul_pd(vBEdge1Fix8, vNextYTileFix8); -+ __m256d vTileStepX2Fix16 = _mm256_mul_pd(vAEdge2Fix8, vNextXTileFix8); -+ __m256d vTileStepY2Fix16 = _mm256_mul_pd(vBEdge2Fix8, vNextYTileFix8); -+ -+ // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile -+ // used to for testing if entire raster tile is inside a triangle -+ __m256d vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vTileOffsetsXIntFix8); -+ __m256d vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vTileOffsetsYIntFix8); -+ vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], _mm256_add_pd(vResultAxFix16, vResultByFix16)); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vTileOffsetsXIntFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vTileOffsetsYIntFix8); -+ vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], _mm256_add_pd(vResultAxFix16, vResultByFix16)); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vTileOffsetsXIntFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vTileOffsetsYIntFix8); -+ vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], _mm256_add_pd(vResultAxFix16, vResultByFix16)); -+ -+ // at this point vEdge has been evaluated at the UL pixel corners of raster 
tile bbox -+ // step sample positions to the raster tile bbox of multisample points -+ // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) -+ // | | -+ // | | -+ // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) -+ __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; -+ if(sampleCount > SWR_MULTISAMPLE_1X) -+ { -+ __m128i vTileSampleBBoxXh = MultisampleTraits::TileSampleOffsetsX(); -+ __m128i vTileSampleBBoxYh = MultisampleTraits::TileSampleOffsetsY(); -+ -+ __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); -+ __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); -+ -+ // step edge equation tests from Tile -+ // used to for testing if entire raster tile is inside a triangle -+ vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vTileSampleBBoxXFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vTileSampleBBoxYFix8); -+ vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vTileSampleBBoxXFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vTileSampleBBoxYFix8); -+ vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vTileSampleBBoxXFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vTileSampleBBoxYFix8); -+ vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ } -+ -+ RDTSC_STOP(BEStepSetup, 0, pDC->drawId); -+ -+ uint32_t tY = tileY; -+ uint32_t tX = tileX; -+ uint32_t maxY = maxTileY; -+ uint32_t maxX = maxTileX; -+ -+ triDesc.pSamplePos = pDC->pState->state.samplePos; -+ -+ // compute steps between raster tiles for render output buffers -+ static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; -+ static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; -+ static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; -+ static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; -+ static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; -+ static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; -+ RenderOutputBuffers renderBuffers, currentRenderBufferRow; -+ -+ GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, MultisampleTraits::numSamples, -+ triDesc.triFlags.renderTargetArrayIndex); -+ currentRenderBufferRow = renderBuffers; -+ -+ // rasterize and generate coverage masks per sample -+ uint32_t maxSamples = MultisampleTraits::numSamples; -+ for (uint32_t tileY = tY; tileY <= maxY; ++tileY) -+ { -+ __m256d vStartOfRowEdge0 = vEdgeFix16[0]; -+ __m256d vStartOfRowEdge1 = vEdgeFix16[1]; -+ __m256d vStartOfRowEdge2 = vEdgeFix16[2]; -+ -+ for (uint32_t tileX = tX; tileX <= maxX; ++tileX) -+ { -+ uint64_t anyCoveredSamples = 0; -+ -+ // is the corner of the edge outside of the raster tile? (vEdge < 0) -+ int mask0, mask1, mask2; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // is the corner of the edge outside of the raster tile? 
(vEdge < 0) -+ mask0 = _mm256_movemask_pd(vEdgeFix16[0]); -+ mask1 = _mm256_movemask_pd(vEdgeFix16[1]); -+ mask2 = _mm256_movemask_pd(vEdgeFix16[2]); -+ } -+ else -+ { -+ __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; -+ // evaluate edge equations at the tile multisample bounding box -+ vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); -+ vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); -+ vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); -+ mask0 = _mm256_movemask_pd(vSampleBboxTest0); -+ mask1 = _mm256_movemask_pd(vSampleBboxTest1); -+ mask2 = _mm256_movemask_pd(vSampleBboxTest2); -+ } -+ -+ for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++) -+ { -+ // trivial reject, at least one edge has all 4 corners of raster tile outside -+ bool trivialReject = (!(mask0 && mask1 && mask2)) ? true : false; -+ -+ if (!trivialReject) -+ { -+ // trivial accept mask -+ triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; -+ if ((mask0 & mask1 & mask2) == 0xf) -+ { -+ anyCoveredSamples = triDesc.coverageMask[sampleNum]; -+ // trivial accept, all 4 corners of all 3 edges are negative -+ // i.e. raster tile completely inside triangle -+ RDTSC_EVENT(BETrivialAccept, 1, 0); -+ } -+ else -+ { -+ __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // should get optimized out for single sample case (global value numbering or copy propagation) -+ vEdge0AtSample = vEdgeFix16[0]; -+ vEdge1AtSample = vEdgeFix16[1]; -+ vEdge2AtSample = vEdgeFix16[2]; -+ } -+ else -+ { -+ __m128i vSampleOffsetXh = MultisampleTraits::vXi(sampleNum); -+ __m128i vSampleOffsetYh = MultisampleTraits::vYi(sampleNum); -+ __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); -+ __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); -+ -+ // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0] -+ // for each edge and broadcasts it before offsetting to individual pixel quads -+ -+ // step edge equation tests from UL tile corner to pixel sample position -+ vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vSampleOffsetX); -+ vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vSampleOffsetY); -+ vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vSampleOffsetX); -+ vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vSampleOffsetY); -+ vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vSampleOffsetX); -+ vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vSampleOffsetY); -+ vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); -+ } -+ -+ // not trivial accept or reject, must rasterize full tile -+ RDTSC_START(BERasterizePartial); -+ triDesc.coverageMask[sampleNum] = rasterizePartialTile(pDC, vEdge0AtSample, vEdge1AtSample, vEdge2AtSample, -+ vAi, vBi, vStepQuad0Fix16, vStepQuad1Fix16, vStepQuad2Fix16); -+ RDTSC_STOP(BERasterizePartial, 0, 0); -+ -+ anyCoveredSamples |= triDesc.coverageMask[sampleNum]; -+ } -+ } -+ else -+ { -+ if(sampleCount > SWR_MULTISAMPLE_1X) -+ { -+ triDesc.coverageMask[sampleNum] = 0; -+ } -+ RDTSC_EVENT(BETrivialReject, 1, 0); -+ } -+ } -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if(KNOB_TOSS_RS) -+ { -+ gToss = triDesc.coverageMask[0]; -+ } -+ else 
-+#endif -+ if(anyCoveredSamples) -+ { -+ RDTSC_START(BEPixelBackend); -+ pDC->pState->pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); -+ RDTSC_STOP(BEPixelBackend, 0, 0); -+ } -+ -+ // step to the next tile in X -+ vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], vTileStepX0Fix16); -+ vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], vTileStepX1Fix16); -+ vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], vTileStepX2Fix16); -+ -+ StepRasterTileX(state.psState.maxRTSlotUsed, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep); -+ } -+ -+ // step to the next tile in Y -+ vEdgeFix16[0] = _mm256_add_pd(vStartOfRowEdge0, vTileStepY0Fix16); -+ vEdgeFix16[1] = _mm256_add_pd(vStartOfRowEdge1, vTileStepY1Fix16); -+ vEdgeFix16[2] = _mm256_add_pd(vStartOfRowEdge2, vTileStepY2Fix16); -+ -+ StepRasterTileY(state.psState.maxRTSlotUsed, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep); -+ } -+ -+ RDTSC_STOP(BERasterizeTriangle, 1, 0); -+} -+ -+void RasterizePoint(DRAW_CONTEXT *pDC, uint32_t workerId, const TRIANGLE_WORK_DESC &workDesc, uint32_t macroTile) -+{ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_BIN_TRIS) -+ { -+ return; -+ } -+#endif -+ -+ // map x,y relative offsets from start of raster tile to bit position in -+ // coverage mask for the point -+ static const uint32_t coverageMap[8][8] = { -+ { 0, 1, 4, 5, 8, 9, 12, 13 }, -+ { 2, 3, 6, 7, 10, 11, 14, 15 }, -+ { 16, 17, 20, 21, 24, 25, 28, 29 }, -+ { 18, 19, 22, 23, 26, 27, 30, 31 }, -+ { 32, 33, 36, 37, 40, 41, 44, 45 }, -+ { 34, 35, 38, 39, 42, 43, 46, 47 }, -+ { 48, 49, 52, 53, 56, 57, 60, 61 }, -+ { 50, 51, 54, 55, 58, 59, 62, 63 } -+ }; -+ -+ OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; -+ -+ // pull point information from triangle buffer -+ // @todo use structs for readability -+ uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; -+ uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); -+ float z = *(workDesc.pTriBuffer + 2); -+ -+ // construct triangle descriptor for point -+ // no interpolation, set up i,j for constant interpolation of z and attribs -+ // @todo implement an optimized backend that doesn't require triangle information -+ -+ // compute coverage mask from x,y packed into the coverageMask flag -+ // mask indices by the maximum valid index for x/y of coveragemap. -+ uint32_t tX = workDesc.triFlags.coverageMask & 0x7; -+ uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; -+ // todo: multisample points? 
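-+    // The point's x/y offset within the raster tile arrives packed in
-+    // triFlags.coverageMask (x in bits 0..2, y in bits 4..6); coverageMap converts that
-+    // 2D offset into the swizzled bit index of the single covered sample.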
-+ triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; -+ -+ // no persp divide needed for points -+ triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; -+ triDesc.triFlags = workDesc.triFlags; -+ triDesc.recipDet = 1.0f; -+ triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; -+ triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; -+ triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; -+ triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; -+ -+ RenderOutputBuffers renderBuffers; -+ GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, -+ renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex); -+ -+ RDTSC_START(BEPixelBackend); -+ pDC->pState->pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); -+ RDTSC_STOP(BEPixelBackend, 0, 0); -+} -+ -+void rastPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ TRIANGLE_WORK_DESC *pDesc = (TRIANGLE_WORK_DESC*)pData; -+ RasterizePoint(pDC, workerId, *pDesc, macroTile); -+ -+} -+// Get pointers to hot tile memory for color RT, depth, stencil -+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, -+ uint32_t numSamples, uint32_t renderTargetArrayIndex) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const SWR_DEPTH_STENCIL_STATE *pDSState = &state.depthStencilState; -+ const uint32_t MaxRT = state.psState.maxRTSlotUsed; -+ -+ uint32_t mx, my; -+ MacroTileMgr::getTileIndices(macroID, mx, my); -+ tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; -+ tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; -+ -+ if(state.psState.pfnPixelShader != NULL) -+ { -+ // compute tile offset for active hottile buffers -+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; -+ uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); -+ offset*=numSamples; -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, -+ numSamples, renderTargetArrayIndex); -+ pColor->state = HOTTILE_DIRTY; -+ renderBuffers.pColor[rt] = pColor->pBuffer + offset; -+ } -+ } -+ if(pDSState->depthTestEnable || pDSState->depthWriteEnable) -+ { -+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; -+ uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); -+ offset*=numSamples; -+ HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, -+ numSamples, renderTargetArrayIndex); -+ pDepth->state = HOTTILE_DIRTY; -+ SWR_ASSERT(pDepth->pBuffer != nullptr); -+ renderBuffers.pDepth = pDepth->pBuffer + offset; -+ } -+ if(pDSState->stencilTestEnable) -+ { -+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; -+ uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); -+ offset*=numSamples; -+ HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, -+ numSamples, renderTargetArrayIndex); -+ pStencil->state = HOTTILE_DIRTY; -+ SWR_ASSERT(pStencil->pBuffer != nullptr); -+ renderBuffers.pStencil = pStencil->pBuffer + offset; -+ } -+} -+ -+INLINE -+void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep) -+{ -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ 
buffers.pColor[rt] += colorTileStep; -+ } -+ -+ buffers.pDepth += depthTileStep; -+ buffers.pStencil += stencilTileStep; -+} -+ -+INLINE -+void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep) -+{ -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ startBufferRow.pColor[rt] += colorRowStep; -+ buffers.pColor[rt] = startBufferRow.pColor[rt]; -+ } -+ startBufferRow.pDepth += depthRowStep; -+ buffers.pDepth = startBufferRow.pDepth; -+ -+ startBufferRow.pStencil += stencilRowStep; -+ buffers.pStencil = startBufferRow.pStencil; -+} -+ -+// initialize rasterizer function table -+PFN_WORK_FUNC gRasterizerTable[SWR_MULTISAMPLE_TYPE_MAX] = -+{ -+ RasterizeTriangle, -+ RasterizeTriangle, -+ RasterizeTriangle, -+ RasterizeTriangle, -+ RasterizeTriangle -+}; -+ -+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_BIN_TRIS) -+ { -+ return; -+ } -+#endif -+ -+ // bloat line to two tris and call the triangle rasterizer twice -+ RDTSC_START(BERasterizeLine); -+ -+ const API_STATE &state = GetApiState(pDC); -+ -+ // macrotile dimensioning -+ uint32_t macroX, macroY; -+ MacroTileMgr::getTileIndices(macroTile, macroX, macroY); -+ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; -+ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; -+ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; -+ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; -+ -+ // create a copy of the triangle buffer to write our adjusted vertices to -+ OSALIGNSIMD(float) newTriBuffer[4 * 4]; -+ TRIANGLE_WORK_DESC newWorkDesc = workDesc; -+ newWorkDesc.pTriBuffer = &newTriBuffer[0]; -+ -+ // create a copy of the attrib buffer to write our adjusted attribs to -+ OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; -+ newWorkDesc.pAttribs = &newAttribBuffer[0]; -+ -+ const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); -+ const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); -+ -+ __m128 vX, vY, vZ, vRecipW; -+ -+ vX = _mm_load_ps(workDesc.pTriBuffer); -+ vY = _mm_load_ps(workDesc.pTriBuffer + 4); -+ vZ = _mm_load_ps(workDesc.pTriBuffer + 8); -+ vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); -+ -+ // triangle 0 -+ // v0,v1 -> v0,v0,v1 -+ __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); -+ __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); -+ __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); -+ __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); -+ -+ __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); -+ __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); -+ if (workDesc.triFlags.yMajor) -+ { -+ vXa = _mm_add_ps(vAdjust, vXa); -+ } -+ else -+ { -+ vYa = _mm_add_ps(vAdjust, vYa); -+ } -+ -+ // Store triangle description for rasterizer -+ _mm_store_ps((float*)&newTriBuffer[0], vXa); -+ _mm_store_ps((float*)&newTriBuffer[4], vYa); -+ _mm_store_ps((float*)&newTriBuffer[8], vZa); -+ _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); -+ -+ // binner bins 3 edges for lines as v0, v1, v1 -+ // tri0 needs v0, v0, v1 -+ for (uint32_t a = 0; a < workDesc.numAttribs; ++a) -+ { -+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]); -+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]); -+ -+ 
_mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0); -+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0); -+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1); -+ } -+ -+ // Store user clip distances for triangle 0 -+ float newClipBuffer[3 * 8]; -+ uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); -+ if (numClipDist) -+ { -+ newWorkDesc.pUserClipBuffer = newClipBuffer; -+ -+ float* pOldBuffer = workDesc.pUserClipBuffer; -+ float* pNewBuffer = newClipBuffer; -+ for (uint32_t i = 0; i < numClipDist; ++i) -+ { -+ // read barycentric coeffs from binner -+ float a = *(pOldBuffer++); -+ float b = *(pOldBuffer++); -+ -+ // reconstruct original clip distance at vertices -+ float c0 = a + b; -+ float c1 = b; -+ -+ // construct triangle barycentrics -+ *(pNewBuffer++) = c0 - c1; -+ *(pNewBuffer++) = c0 - c1; -+ *(pNewBuffer++) = c1; -+ } -+ } -+ -+ // make sure this macrotile intersects the triangle -+ __m128i vXai = fpToFixedPoint(vXa); -+ __m128i vYai = fpToFixedPoint(vYa); -+ OSALIGN(BBOX, 16) bboxA; -+ calcBoundingBoxInt(vXai, vYai, bboxA); -+ -+ if (!(bboxA.left > macroBoxRight || -+ bboxA.left > state.scissorInFixedPoint.right || -+ bboxA.right - 1 < macroBoxLeft || -+ bboxA.right - 1 < state.scissorInFixedPoint.left || -+ bboxA.top > macroBoxBottom || -+ bboxA.top > state.scissorInFixedPoint.bottom || -+ bboxA.bottom - 1 < macroBoxTop || -+ bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { -+ // rasterize triangle -+ RasterizeTriangle(pDC, workerId, macroTile, (void*)&newWorkDesc); -+ } -+ -+ // triangle 1 -+ // v0,v1 -> v1,v1,v0 -+ vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); -+ vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); -+ vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); -+ vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); -+ -+ vAdjust = _mm_mul_ps(vLineWidth, vBloat1); -+ if (workDesc.triFlags.yMajor) -+ { -+ vXa = _mm_add_ps(vAdjust, vXa); -+ } -+ else -+ { -+ vYa = _mm_add_ps(vAdjust, vYa); -+ } -+ -+ // Store triangle description for rasterizer -+ _mm_store_ps((float*)&newTriBuffer[0], vXa); -+ _mm_store_ps((float*)&newTriBuffer[4], vYa); -+ _mm_store_ps((float*)&newTriBuffer[8], vZa); -+ _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); -+ -+ // binner bins 3 edges for lines as v0, v1, v1 -+ // tri1 needs v1, v1, v0 -+ for (uint32_t a = 0; a < workDesc.numAttribs; ++a) -+ { -+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); -+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); -+ -+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); -+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); -+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); -+ } -+ -+ // store user clip distance for triangle 1 -+ if (numClipDist) -+ { -+ float* pOldBuffer = workDesc.pUserClipBuffer; -+ float* pNewBuffer = newClipBuffer; -+ for (uint32_t i = 0; i < numClipDist; ++i) -+ { -+ // read barycentric coeffs from binner -+ float a = *(pOldBuffer++); -+ float b = *(pOldBuffer++); -+ -+ // reconstruct original clip distance at vertices -+ float c0 = a + b; -+ float c1 = b; -+ -+ // construct triangle barycentrics -+ *(pNewBuffer++) = c1 - c0; -+ *(pNewBuffer++) = c1 - c0; -+ *(pNewBuffer++) = c0; -+ } -+ } -+ -+ vXai = fpToFixedPoint(vXa); -+ vYai = fpToFixedPoint(vYa); -+ calcBoundingBoxInt(vXai, vYai, bboxA); -+ -+ if (!(bboxA.left > macroBoxRight || -+ bboxA.left > state.scissorInFixedPoint.right || -+ bboxA.right - 1 < macroBoxLeft || -+ bboxA.right - 
1 < state.scissorInFixedPoint.left || -+ bboxA.top > macroBoxBottom || -+ bboxA.top > state.scissorInFixedPoint.bottom || -+ bboxA.bottom - 1 < macroBoxTop || -+ bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { -+ // rasterize triangle -+ RasterizeTriangle(pDC, workerId, macroTile, (void*)&newWorkDesc); -+ } -+ -+ RDTSC_STOP(BERasterizeLine, 1, 0); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h -new file mode 100644 -index 0000000..e07d7ea ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h -@@ -0,0 +1,34 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rasterizer.h -+* -+* @brief Definitions for the rasterizer. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "context.h" -+ -+void rastPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -+extern PFN_WORK_FUNC gRasterizerTable[SWR_MULTISAMPLE_TYPE_MAX]; -+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp -new file mode 100644 -index 0000000..df96f72 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp -@@ -0,0 +1,90 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#include "rdtsc_core.h" -+#include "common/rdtsc_buckets.h" -+ -+// must match CORE_BUCKETS enum order -+BUCKET_DESC gCoreBuckets[] = { -+ { "APIClearRenderTarget", "", true, 0xff0b8bea }, -+ { "APIDraw", "", true, 0xff000066 }, -+ { "APIDrawWakeAllThreads", "", false, 0xffffffff }, -+ { "APIDrawIndexed", "", true, 0xff000066 }, -+ { "APIDispatch", "", true, 0xff660000 }, -+ { "APIStoreTiles", "", true, 0xff00ffff }, -+ { "APIGetDrawContext", "", false, 0xffffffff }, -+ { "APISync", "", true, 0xff6666ff }, -+ { "FEProcessDraw", "", true, 0xff009900 }, -+ { "FEProcessDrawIndexed", "", true, 0xff009900 }, -+ { "FEFetchShader", "", false, 0xffffffff }, -+ { "FEVertexShader", "", false, 0xffffffff }, -+ { "FEHullShader", "", false, 0xffffffff }, -+ { "FETessellation", "", false, 0xffffffff }, -+ { "FEDomainShader", "", false, 0xffffffff }, -+ { "FEGeometryShader", "", false, 0xffffffff }, -+ { "FEStreamout", "", false, 0xffffffff }, -+ { "FEPAAssemble", "", false, 0xffffffff }, -+ { "FEBinPoints", "", false, 0xff29b854 }, -+ { "FEBinLines", "", false, 0xff29b854 }, -+ { "FEBinTriangles", "", false, 0xff29b854 }, -+ { "FETriangleSetup", "", false, 0xffffffff }, -+ { "FEViewportCull", "", false, 0xffffffff }, -+ { "FEGuardbandClip", "", false, 0xffffffff }, -+ { "FEClipPoints", "", false, 0xffffffff }, -+ { "FEClipLines", "", false, 0xffffffff }, -+ { "FEClipTriangles", "", false, 0xffffffff }, -+ { "FECullZeroAreaAndBackface", "", false, 0xffffffff }, -+ { "FECullBetweenCenters", "", false, 0xffffffff }, -+ { "FEProcessStoreTiles", "", true, 0xff39c864 }, -+ { "FEProcessInvalidateTiles", "", true, 0xffffffff }, -+ { "WorkerWorkOnFifoBE", "", false, 0xff40261c }, -+ { "WorkerFoundWork", "", false, 0xff573326 }, -+ { "BELoadTiles", "", true, 0xffb0e2ff }, -+ { "BEDispatch", "", true, 0xff00a2ff }, -+ { "BEClear", "", true, 0xff00ccbb }, -+ { "BERasterizeLine", "", true, 0xffb26a4e }, -+ { "BERasterizeTriangle", "", true, 0xffb26a4e }, -+ { "BETriangleSetup", "", false, 0xffffffff }, -+ { "BEStepSetup", "", false, 0xffffffff }, -+ { "BECullZeroArea", "", false, 0xffffffff }, -+ { "BEEmptyTriangle", "", false, 0xffffffff }, -+ { "BETrivialAccept", "", false, 0xffffffff }, -+ { "BETrivialReject", "", false, 0xffffffff }, -+ { "BERasterizePartial", "", false, 0xffffffff }, -+ { "BEPixelBackend", "", false, 0xffffffff }, -+ { "BESetup", "", false, 0xffffffff }, -+ { "BEBarycentric", "", false, 0xffffffff }, -+ { "BEEarlyDepthTest", "", false, 0xffffffff }, -+ { "BEPixelShader", "", false, 0xffffffff }, -+ { "BELateDepthTest", "", false, 0xffffffff }, -+ { "BEOutputMerger", "", false, 0xffffffff }, -+ { "BEStoreTiles", "", true, 0xff00cccc }, -+ { "BEEndTile", "", false, 0xffffffff }, -+ { "WorkerWaitForThreadEvent", "", false, 0xffffffff }, -+}; -+ -+/// @todo bucketmanager and mapping should probably be a part of the SWR context -+std::vector gBucketMap; -+BucketManager gBucketMgr(false); -+ -+uint32_t gCurrentFrame = 0; -diff --git 
a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h -new file mode 100644 -index 0000000..1e3700d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h -@@ -0,0 +1,175 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#pragma once -+#include "knobs.h" -+ -+#include "common/os.h" -+#include "common/rdtsc_buckets.h" -+ -+#include -+ -+enum CORE_BUCKETS -+{ -+ APIClearRenderTarget, -+ APIDraw, -+ APIDrawWakeAllThreads, -+ APIDrawIndexed, -+ APIDispatch, -+ APIStoreTiles, -+ APIGetDrawContext, -+ APISync, -+ FEProcessDraw, -+ FEProcessDrawIndexed, -+ FEFetchShader, -+ FEVertexShader, -+ FEHullShader, -+ FETessellation, -+ FEDomainShader, -+ FEGeometryShader, -+ FEStreamout, -+ FEPAAssemble, -+ FEBinPoints, -+ FEBinLines, -+ FEBinTriangles, -+ FETriangleSetup, -+ FEViewportCull, -+ FEGuardbandClip, -+ FEClipPoints, -+ FEClipLines, -+ FEClipTriangles, -+ FECullZeroAreaAndBackface, -+ FECullBetweenCenters, -+ FEProcessStoreTiles, -+ FEProcessInvalidateTiles, -+ WorkerWorkOnFifoBE, -+ WorkerFoundWork, -+ BELoadTiles, -+ BEDispatch, -+ BEClear, -+ BERasterizeLine, -+ BERasterizeTriangle, -+ BETriangleSetup, -+ BEStepSetup, -+ BECullZeroArea, -+ BEEmptyTriangle, -+ BETrivialAccept, -+ BETrivialReject, -+ BERasterizePartial, -+ BEPixelBackend, -+ BESetup, -+ BEBarycentric, -+ BEEarlyDepthTest, -+ BEPixelShader, -+ BELateDepthTest, -+ BEOutputMerger, -+ BEStoreTiles, -+ BEEndTile, -+ WorkerWaitForThreadEvent, -+ -+ NumBuckets -+}; -+ -+void rdtscReset(); -+void rdtscInit(int threadId); -+void rdtscStart(uint32_t bucketId); -+void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId); -+void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2); -+void rdtscEndFrame(); -+ -+#ifdef KNOB_ENABLE_RDTSC -+#define RDTSC_RESET() rdtscReset() -+#define RDTSC_INIT(threadId) rdtscInit(threadId) -+#define RDTSC_START(bucket) rdtscStart(bucket) -+#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw) -+#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2) -+#define RDTSC_ENDFRAME() rdtscEndFrame() -+#else -+#define RDTSC_RESET() -+#define RDTSC_INIT(threadId) -+#define RDTSC_START(bucket) 
-+#define RDTSC_STOP(bucket, count, draw) -+#define RDTSC_EVENT(bucket, count1, count2) -+#define RDTSC_ENDFRAME() -+#endif -+ -+extern std::vector gBucketMap; -+extern BucketManager gBucketMgr; -+extern BUCKET_DESC gCoreBuckets[]; -+extern uint32_t gCurrentFrame; -+ -+INLINE void rdtscReset() -+{ -+ gCurrentFrame = 0; -+ gBucketMgr.ClearThreads(); -+ gBucketMgr.ClearBuckets(); -+} -+ -+INLINE void rdtscInit(int threadId) -+{ -+ // register all the buckets once -+ if (threadId == 0) -+ { -+ gBucketMap.resize(NumBuckets); -+ for (uint32_t i = 0; i < NumBuckets; ++i) -+ { -+ gBucketMap[i] = gBucketMgr.RegisterBucket(gCoreBuckets[i]); -+ } -+ } -+ -+ std::string name = threadId == 0 ? "API" : "WORKER"; -+ gBucketMgr.RegisterThread(name); -+} -+ -+INLINE void rdtscStart(uint32_t bucketId) -+{ -+ uint32_t id = gBucketMap[bucketId]; -+ gBucketMgr.StartBucket(id); -+} -+ -+INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId) -+{ -+ uint32_t id = gBucketMap[bucketId]; -+ gBucketMgr.StopBucket(id); -+} -+ -+INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2) -+{ -+ -+} -+ -+INLINE void rdtscEndFrame() -+{ -+ gCurrentFrame++; -+ -+ if (gCurrentFrame == KNOB_BUCKETS_START_FRAME) -+ { -+ gBucketMgr.StartCapture(); -+ } -+ -+ if (gCurrentFrame == KNOB_BUCKETS_END_FRAME) -+ { -+ gBucketMgr.StopCapture(); -+ gBucketMgr.PrintReport("rdtsc.txt"); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h -new file mode 100644 -index 0000000..ad8b91fc ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/state.h -@@ -0,0 +1,918 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file state.h -+* -+* @brief Definitions for API state. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "common/simdintrin.h" -+ -+// clear flags -+#define SWR_CLEAR_NONE 0 -+#define SWR_CLEAR_COLOR (1 << 0) -+#define SWR_CLEAR_DEPTH (1 << 1) -+#define SWR_CLEAR_STENCIL (1 << 2) -+ -+enum DRIVER_TYPE -+{ -+ DX, -+ GL -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PRIMITIVE_TOPOLOGY. 
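-+/// TOP_PATCHLIST_1..TOP_PATCHLIST_32 encode patch lists with 1 to 32 control points
-+/// (value TOP_PATCHLIST_BASE + N).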
-+////////////////////////////////////////////////////////////////////////// -+enum PRIMITIVE_TOPOLOGY -+{ -+ TOP_UNKNOWN = 0x0, -+ TOP_POINT_LIST = 0x1, -+ TOP_LINE_LIST = 0x2, -+ TOP_LINE_STRIP = 0x3, -+ TOP_TRIANGLE_LIST = 0x4, -+ TOP_TRIANGLE_STRIP = 0x5, -+ TOP_TRIANGLE_FAN = 0x6, -+ TOP_QUAD_LIST = 0x7, -+ TOP_QUAD_STRIP = 0x8, -+ TOP_LINE_LIST_ADJ = 0x9, -+ TOP_LISTSTRIP_ADJ = 0xA, -+ TOP_TRI_LIST_ADJ = 0xB, -+ TOP_TRI_STRIP_ADJ = 0xC, -+ TOP_TRI_STRIP_REVERSE = 0xD, -+ TOP_POLYGON = 0xE, -+ TOP_RECT_LIST = 0xF, -+ TOP_LINE_LOOP = 0x10, -+ TOP_POINT_LIST_BF = 0x11, -+ TOP_LINE_STRIP_CONT = 0x12, -+ TOP_LINE_STRIP_BF = 0x13, -+ TOP_LINE_STRIP_CONT_BF = 0x14, -+ TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, -+ TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? -+ -+ TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. -+ TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches -+ TOP_PATCHLIST_2 = 0x21, -+ TOP_PATCHLIST_3 = 0x22, -+ TOP_PATCHLIST_4 = 0x23, -+ TOP_PATCHLIST_5 = 0x24, -+ TOP_PATCHLIST_6 = 0x25, -+ TOP_PATCHLIST_7 = 0x26, -+ TOP_PATCHLIST_8 = 0x27, -+ TOP_PATCHLIST_9 = 0x28, -+ TOP_PATCHLIST_10 = 0x29, -+ TOP_PATCHLIST_11 = 0x2A, -+ TOP_PATCHLIST_12 = 0x2B, -+ TOP_PATCHLIST_13 = 0x2C, -+ TOP_PATCHLIST_14 = 0x2D, -+ TOP_PATCHLIST_15 = 0x2E, -+ TOP_PATCHLIST_16 = 0x2F, -+ TOP_PATCHLIST_17 = 0x30, -+ TOP_PATCHLIST_18 = 0x31, -+ TOP_PATCHLIST_19 = 0x32, -+ TOP_PATCHLIST_20 = 0x33, -+ TOP_PATCHLIST_21 = 0x34, -+ TOP_PATCHLIST_22 = 0x35, -+ TOP_PATCHLIST_23 = 0x36, -+ TOP_PATCHLIST_24 = 0x37, -+ TOP_PATCHLIST_25 = 0x38, -+ TOP_PATCHLIST_26 = 0x39, -+ TOP_PATCHLIST_27 = 0x3A, -+ TOP_PATCHLIST_28 = 0x3B, -+ TOP_PATCHLIST_29 = 0x3C, -+ TOP_PATCHLIST_30 = 0x3D, -+ TOP_PATCHLIST_31 = 0x3E, -+ TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_SHADER_TYPE -+////////////////////////////////////////////////////////////////////////// -+enum SWR_SHADER_TYPE -+{ -+ SHADER_VERTEX, -+ SHADER_GEOMETRY, -+ SHADER_DOMAIN, -+ SHADER_HULL, -+ SHADER_PIXEL, -+ SHADER_COMPUTE, -+ -+ NUM_SHADER_TYPES, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_RENDERTARGET_ATTACHMENT -+/// @todo Its not clear what an "attachment" means. Its not common term. 
-+////////////////////////////////////////////////////////////////////////// -+enum SWR_RENDERTARGET_ATTACHMENT -+{ -+ SWR_ATTACHMENT_COLOR0, -+ SWR_ATTACHMENT_COLOR1, -+ SWR_ATTACHMENT_COLOR2, -+ SWR_ATTACHMENT_COLOR3, -+ SWR_ATTACHMENT_COLOR4, -+ SWR_ATTACHMENT_COLOR5, -+ SWR_ATTACHMENT_COLOR6, -+ SWR_ATTACHMENT_COLOR7, -+ SWR_ATTACHMENT_DEPTH, -+ SWR_ATTACHMENT_STENCIL, -+ -+ SWR_NUM_ATTACHMENTS -+}; -+ -+#define SWR_NUM_RENDERTARGETS 8 -+ -+#define SWR_ATTACHMENT_COLOR0_BIT 0x001 -+#define SWR_ATTACHMENT_COLOR1_BIT 0x002 -+#define SWR_ATTACHMENT_COLOR2_BIT 0x004 -+#define SWR_ATTACHMENT_COLOR3_BIT 0x008 -+#define SWR_ATTACHMENT_COLOR4_BIT 0x010 -+#define SWR_ATTACHMENT_COLOR5_BIT 0x020 -+#define SWR_ATTACHMENT_COLOR6_BIT 0x040 -+#define SWR_ATTACHMENT_COLOR7_BIT 0x080 -+#define SWR_ATTACHMENT_DEPTH_BIT 0x100 -+#define SWR_ATTACHMENT_STENCIL_BIT 0x200 -+#define SWR_ATTACHMENT_MASK_ALL 0x3ff -+#define SWR_ATTACHMENT_MASK_COLOR 0x0ff -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SWR Inner Tessellation factor ID -+/// See above GetTessFactorOutputPosition code for documentation -+enum SWR_INNER_TESSFACTOR_ID -+{ -+ SWR_QUAD_U_TRI_INSIDE, -+ SWR_QUAD_V_INSIDE, -+ -+ SWR_NUM_INNER_TESS_FACTORS, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SWR Outer Tessellation factor ID -+/// See above GetTessFactorOutputPosition code for documentation -+enum SWR_OUTER_TESSFACTOR_ID -+{ -+ SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, -+ SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY, -+ SWR_QUAD_U_EQ1_TRI_W, -+ SWR_QUAD_V_EQ1, -+ -+ SWR_NUM_OUTER_TESS_FACTORS, -+}; -+ -+ -+///////////////////////////////////////////////////////////////////////// -+/// simdvertex -+/// @brief Defines a vertex element that holds all the data for SIMD vertices. 
-+/// Contains position in clip space, hardcoded to attribute 0, -+/// space for up to 32 attributes, as well as any SGV values generated -+/// by the pipeline (to be implemented) -+///////////////////////////////////////////////////////////////////////// -+#define VERTEX_POSITION_SLOT 0 -+#define VERTEX_ATTRIB_START_SLOT 1 -+#define VERTEX_ATTRIB_END_SLOT 32 -+#define VERTEX_RTAI_SLOT 33 // GS will write RenderTargetArrayIndex here -+#define VERTEX_PRIMID_SLOT 34 // GS will write PrimId here -+#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS will write lower 4 clip/cull dist -+#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS will write upper 4 clip/cull dist -+static_assert(VERTEX_CLIPCULL_DIST_HI_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size"); -+ -+// SoAoSoA -+struct simdvertex -+{ -+ simdvector attrib[KNOB_NUM_ATTRIBUTES]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_VS_CONTEXT -+/// @brief Input to vertex shader -+///////////////////////////////////////////////////////////////////////// -+struct SWR_VS_CONTEXT -+{ -+ simdvertex* pVin; // IN: SIMD input vertex data store -+ simdvertex* pVout; // OUT: SIMD output vertex data store -+ -+ uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD -+ simdscalari VertexID; // IN: Vertex ID -+ simdscalari mask; // IN: Active mask for shader -+}; -+ -+///////////////////////////////////////////////////////////////////////// -+/// ScalarCPoint -+/// @brief defines a control point element as passed from the output -+/// of the hull shader to the input of the domain shader -+///////////////////////////////////////////////////////////////////////// -+struct ScalarAttrib -+{ -+ float x; -+ float y; -+ float z; -+ float w; -+}; -+ -+struct ScalarCPoint -+{ -+ ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TESSELLATION_FACTORS -+/// @brief Tessellation factors structure (non-vector) -+///////////////////////////////////////////////////////////////////////// -+struct SWR_TESSELLATION_FACTORS -+{ -+ float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; -+ float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; -+}; -+ -+#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches -+struct ScalarPatch -+{ -+ SWR_TESSELLATION_FACTORS tessFactors; -+ ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; -+ ScalarCPoint patchData; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_HS_CONTEXT -+/// @brief Input to hull shader -+///////////////////////////////////////////////////////////////////////// -+struct SWR_HS_CONTEXT -+{ -+ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data -+ simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call -+ ScalarPatch* pCPout; // OUT: Output control point patch -+ // SIMD-sized-array of SCALAR patches -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_DS_CONTEXT -+/// @brief Input to domain shader -+///////////////////////////////////////////////////////////////////////// -+struct SWR_DS_CONTEXT -+{ -+ uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation -+ uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data. 
-+ uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component -+ ScalarPatch* pCpIn; // IN: (SCALAR) Control patch -+ simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords -+ simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords -+ simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component) -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_GS_CONTEXT -+/// @brief Input to geometry shader. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_GS_CONTEXT -+{ -+ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims -+ simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call -+ uint32_t InstanceID; // IN: input instance ID -+ uint8_t* pStream[4]; // OUT: output streams -+ uint8_t* pCutBuffer; // OUT: cut buffer -+ simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_PS_CONTEXT -+/// @brief Input to pixel shader. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_PS_CONTEXT -+{ -+ simdscalar vX; // IN: x location of pixels -+ simdscalar vY; // IN: y location of pixels -+ simdscalar vZ; // INOUT: z location of pixels -+ simdscalari mask; // INOUT: mask for kill -+ -+ // rasterizer generated barycentric components -+ simdscalar vI; // IN: Barycentric I component -+ simdscalar vJ; // IN: Barycentric J component -+ simdscalar vOneOverW; // IN: 1/w -+ -+ const float* pAttribs; // IN: pointer to attribute barycentric coefficients -+ const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients -+ const float *I; // IN: Barycentric A, B, and C coefs used to compute I -+ const float *J; // IN: Barycentric A, B, and C coefs used to compute J -+ float recipDet; // IN: 1/Det, used when barycentric interpolating attributes -+ const float* pSamplePos; // IN: array of sample positions -+ simdvector shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget -+ -+ uint32_t frontFace; // IN: front- 1, back- 0 -+ uint32_t primID; // IN: primitive ID -+ uint32_t sampleIndex; // IN: sampleIndex -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_CS_CONTEXT -+/// @brief Input to compute shader. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_CS_CONTEXT -+{ -+ // The ThreadGroupId is the current thread group index relative -+ // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup, -+ // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader. -+ -+ // Compute shader accepts the following system values. -+ // o ThreadId - Current thread id relative to all other threads in dispatch. -+ // o ThreadGroupId - Current thread group id relative to all other groups in dispatch. -+ // o ThreadIdInGroup - Current thread relative to all threads in the current thread group. -+ // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup. -+ // -+ // All of these system values can be computed in the shader. They will be -+ // derived from the current tile counter. The tile counter is an atomic counter that -+ // resides in the draw context and is initialized to the product of the dispatch dims. 
-+ // -+ // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z -+ // -+ // Each CPU worker thread will atomically decrement this counter and passes the current -+ // count into the shader. When the count reaches 0 then all thread groups in the -+ // dispatch call have been completed. -+ -+ uint32_t tileCounter; // The tile counter value for this thread group. -+ -+ // Dispatch dimensions used by shader to compute system values from the tile counter. -+ uint32_t dispatchDims[3]; -+ -+ uint8_t* pTGSM; // Thread Group Shared Memory pointer. -+}; -+ -+// enums -+enum SWR_TILE_MODE -+{ -+ SWR_TILE_NONE = 0x0, // Linear mode (no tiling) -+ SWR_TILE_MODE_WMAJOR, // W major tiling -+ SWR_TILE_MODE_XMAJOR, // X major tiling -+ SWR_TILE_MODE_YMAJOR, // Y major tiling -+ SWR_TILE_SWRZ, // SWR-Z tiling -+ -+ SWR_TILE_MODE_COUNT -+}; -+ -+enum SWR_SURFACE_TYPE -+{ -+ SURFACE_1D = 0, -+ SURFACE_2D = 1, -+ SURFACE_3D = 2, -+ SURFACE_CUBE = 3, -+ SURFACE_BUFFER = 4, -+ SURFACE_STRUCTURED_BUFFER = 5, -+ SURFACE_NULL = 7 -+}; -+ -+enum SWR_ZFUNCTION -+{ -+ ZFUNC_ALWAYS, -+ ZFUNC_NEVER, -+ ZFUNC_LT, -+ ZFUNC_EQ, -+ ZFUNC_LE, -+ ZFUNC_GT, -+ ZFUNC_NE, -+ ZFUNC_GE, -+ NUM_ZFUNC -+}; -+ -+enum SWR_STENCILOP -+{ -+ STENCILOP_KEEP, -+ STENCILOP_ZERO, -+ STENCILOP_REPLACE, -+ STENCILOP_INCRSAT, -+ STENCILOP_DECRSAT, -+ STENCILOP_INCR, -+ STENCILOP_DECR, -+ STENCILOP_INVERT -+}; -+ -+enum SWR_BLEND_FACTOR -+{ -+ BLENDFACTOR_ONE, -+ BLENDFACTOR_SRC_COLOR, -+ BLENDFACTOR_SRC_ALPHA, -+ BLENDFACTOR_DST_ALPHA, -+ BLENDFACTOR_DST_COLOR, -+ BLENDFACTOR_SRC_ALPHA_SATURATE, -+ BLENDFACTOR_CONST_COLOR, -+ BLENDFACTOR_CONST_ALPHA, -+ BLENDFACTOR_SRC1_COLOR, -+ BLENDFACTOR_SRC1_ALPHA, -+ BLENDFACTOR_ZERO, -+ BLENDFACTOR_INV_SRC_COLOR, -+ BLENDFACTOR_INV_SRC_ALPHA, -+ BLENDFACTOR_INV_DST_ALPHA, -+ BLENDFACTOR_INV_DST_COLOR, -+ BLENDFACTOR_INV_CONST_COLOR, -+ BLENDFACTOR_INV_CONST_ALPHA, -+ BLENDFACTOR_INV_SRC1_COLOR, -+ BLENDFACTOR_INV_SRC1_ALPHA -+}; -+ -+enum SWR_BLEND_OP -+{ -+ BLENDOP_ADD, -+ BLENDOP_SUBTRACT, -+ BLENDOP_REVSUBTRACT, -+ BLENDOP_MIN, -+ BLENDOP_MAX, -+}; -+ -+struct SWR_SURFACE_STATE -+{ -+ uint8_t *pBaseAddress; -+ SWR_SURFACE_TYPE type; // @llvm_enum -+ SWR_FORMAT format; // @llvm_enum -+ uint32_t width; -+ uint32_t height; -+ uint32_t depth; -+ uint32_t numSamples; -+ uint32_t pitch; -+ uint32_t qpitch; -+ uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler -+ uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed -+ float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler -+ uint32_t lod; // for render targets, the lod being rendered to -+ uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces -+ SWR_TILE_MODE tileMode; // @llvm_enum -+ uint32_t halign; -+ uint32_t valign; -+ -+ uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces -+ -+ uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. -+}; -+ -+// vertex fetch state -+// WARNING- any changes to this struct need to be reflected -+// in the fetch shader jit -+struct SWR_VERTEX_BUFFER_STATE -+{ -+ uint32_t index; -+ uint32_t pitch; -+ const uint8_t *pData; -+ uint32_t size; -+ uint32_t numaNode; -+ uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks -+ uint32_t partialInboundsSize; // size % pitch. 
precalculated value used by fetch shader for partially OOB vertices -+}; -+ -+struct SWR_INDEX_BUFFER_STATE -+{ -+ // Format type for indices (e.g. UINT16, UINT32, etc.) -+ SWR_FORMAT format; // @llvm_enum -+ const void *pIndices; -+ uint32_t size; -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_FETCH_CONTEXT -+/// @brief Input to fetch shader. -+/// @note WARNING - Changes to this struct need to be reflected in the -+/// fetch shader jit. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_FETCH_CONTEXT -+{ -+ const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers -+ const int32_t* pIndices; // IN: pointer to index buffer for indexed draws -+ const int32_t* pLastIndex; // IN: pointer to end of index buffer, used for bounds checking -+ uint32_t CurInstance; // IN: current instance -+ uint32_t BaseVertex; // IN: base vertex -+ uint32_t StartVertex; // IN: start vertex -+ uint32_t StartInstance; // IN: start instance -+ simdscalari VertexID; // OUT: vector of vertex IDs -+ simdscalari CutMask; // OUT: vector mask of indices which have the cut index value -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_STATS -+/// -+/// @brief All statistics generated by SWR go here. These are public -+/// to driver. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_STATS -+{ -+ // Occlusion Query -+ uint64_t DepthPassCount; // Number of passing depth tests. Not exact. -+ -+ // Pipeline Stats -+ uint64_t IaVertices; // Number of Fetch Shader vertices -+ uint64_t IaPrimitives; // Number of PA primitives. -+ uint64_t VsInvocations; // Number of Vertex Shader invocations -+ uint64_t HsInvocations; // Number of Hull Shader invocations -+ uint64_t DsInvocations; // Number of Domain Shader invocations -+ uint64_t GsInvocations; // Number of Geometry Shader invocations -+ uint64_t PsInvocations; // Number of Pixel Shader invocations -+ uint64_t CsInvocations; // Number of Compute Shader invocations -+ uint64_t CInvocations; // Number of clipper invocations -+ uint64_t CPrimitives; // Number of clipper primitives. -+ uint64_t GsPrimitives; // Number of prims GS outputs. -+ -+ // Streamout Stats -+ uint32_t SoWriteOffset[4]; -+ uint64_t SoPrimStorageNeeded[4]; -+ uint64_t SoNumPrimsWritten[4]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_BUFFERS -+///////////////////////////////////////////////////////////////////////// -+ -+#define MAX_SO_STREAMS 4 -+#define MAX_ATTRIBUTES 32 -+ -+struct SWR_STREAMOUT_BUFFER -+{ -+ bool enable; -+ -+ // Pointers to streamout buffers. -+ uint32_t* pBuffer; -+ -+ // Size of buffer in dwords. -+ uint32_t bufferSize; -+ -+ // Vertex pitch of buffer in dwords. -+ uint32_t pitch; -+ -+ // Offset into buffer in dwords. SOS will increment this offset. -+ uint32_t streamOffset; -+ -+ // Offset to the SO write offset. If not null then we update offset here. -+ uint32_t* pWriteOffset; -+ -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_STATE -+///////////////////////////////////////////////////////////////////////// -+struct SWR_STREAMOUT_STATE -+{ -+ // This disables stream output. -+ bool soEnable; -+ -+ // which streams are enabled for streamout -+ bool streamEnable[MAX_SO_STREAMS]; -+ -+ // If set then do not send any streams to the rasterizer. 
-+ bool rasterizerDisable; -+ -+ // Specifies which stream to send to the rasterizer. -+ uint32_t streamToRasterizer; -+ -+ // The stream masks specify which attributes are sent to which streams. -+ // These masks help the FE to setup the pPrimData buffer that is passed -+ // the the Stream Output Shader (SOS) function. -+ uint32_t streamMasks[MAX_SO_STREAMS]; -+ -+ // Number of attributes, including position, per vertex that are streamed out. -+ // This should match number of bits in stream mask. -+ uint32_t streamNumEntries[MAX_SO_STREAMS]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_CONTEXT - Passed to SOS -+///////////////////////////////////////////////////////////////////////// -+struct SWR_STREAMOUT_CONTEXT -+{ -+ uint32_t* pPrimData; -+ SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; -+ -+ // Num prims written for this stream -+ uint32_t numPrimsWritten; -+ -+ // Num prims that should have been written if there were no overflow. -+ uint32_t numPrimStorageNeeded; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_GS_STATE - Geometry shader state -+///////////////////////////////////////////////////////////////////////// -+struct SWR_GS_STATE -+{ -+ bool gsEnable; -+ -+ // number of input attributes per vertex. used by the frontend to -+ // optimize assembling primitives for GS -+ uint32_t numInputAttribs; -+ -+ // output topology - can be point, tristrip, or linestrip -+ PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum -+ -+ // maximum number of verts that can be emitted by a single instance of the GS -+ uint32_t maxNumVerts; -+ -+ // instance count -+ uint32_t instanceCount; -+ -+ // geometry shader emits renderTargetArrayIndex -+ bool emitsRenderTargetArrayIndex; -+ -+ // geometry shader emits PrimitiveID -+ bool emitsPrimitiveID; -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS -+///////////////////////////////////////////////////////////////////////// -+enum SWR_TS_OUTPUT_TOPOLOGY -+{ -+ SWR_TS_OUTPUT_POINT, -+ SWR_TS_OUTPUT_LINE, -+ SWR_TS_OUTPUT_TRI_CW, -+ SWR_TS_OUTPUT_TRI_CCW, -+ -+ SWR_TS_OUTPUT_TOPOLOGY_COUNT -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_PARTITIONING - Defines tessellation algorithm -+///////////////////////////////////////////////////////////////////////// -+enum SWR_TS_PARTITIONING -+{ -+ SWR_TS_INTEGER, -+ SWR_TS_ODD_FRACTIONAL, -+ SWR_TS_EVEN_FRACTIONAL, -+ -+ SWR_TS_PARTITIONING_COUNT -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_DOMAIN - Defines Tessellation Domain -+///////////////////////////////////////////////////////////////////////// -+enum SWR_TS_DOMAIN -+{ -+ SWR_TS_QUAD, -+ SWR_TS_TRI, -+ SWR_TS_ISOLINE, -+ -+ SWR_TS_DOMAIN_COUNT -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_STATE - Tessellation state -+///////////////////////////////////////////////////////////////////////// -+struct SWR_TS_STATE -+{ -+ bool tsEnable; -+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum -+ SWR_TS_PARTITIONING partitioning; // @llvm_enum -+ SWR_TS_DOMAIN domain; // @llvm_enum -+ -+ PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum -+ -+ uint32_t numHsInputAttribs; -+ uint32_t numHsOutputAttribs; -+ uint32_t numDsOutputAttribs; -+}; -+ -+// output merger state -+struct 
SWR_RENDER_TARGET_BLEND_STATE -+{ -+ uint32_t colorBlendEnable : 1; -+ uint32_t sourceAlphaBlendFactor : 5; -+ uint32_t destAlphaBlendFactor : 5; -+ uint32_t sourceBlendFactor : 5; -+ uint32_t destBlendFactor : 5; -+ uint32_t colorBlendFunc : 3; -+ uint32_t alphaBlendFunc : 3; -+ -+ uint32_t writeDisableRed : 1; -+ uint32_t writeDisableGreen : 1; -+ uint32_t writeDisableBlue : 1; -+ uint32_t writeDisableAlpha : 1; -+}; -+static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 4, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); -+ -+struct SWR_BLEND_STATE -+{ -+ float constantColor[4]; // constant blend factor color in RGBA float -+ bool independentAlphaBlendEnable; -+ SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; -+}; -+static_assert(sizeof(SWR_BLEND_STATE) == 52, "Invalid SWR_BLEND_STATE size"); -+ -+////////////////////////////////////////////////////////////////////////// -+/// FUNCTION POINTERS FOR SHADERS -+ -+typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); -+typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext); -+typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext); -+typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext); -+typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext); -+typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); -+typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); -+typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); -+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, BYTE*, simdvector&); -+ -+////////////////////////////////////////////////////////////////////////// -+/// FRONTEND_STATE -+///////////////////////////////////////////////////////////////////////// -+struct SWR_FRONTEND_STATE -+{ -+ // skip clip test, perspective divide, and viewport transform -+ // intended for verts in screen space -+ bool vpTransformDisable; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// VIEWPORT_MATRIX -+///////////////////////////////////////////////////////////////////////// -+struct SWR_VIEWPORT_MATRIX -+{ -+ float m00; -+ float m11; -+ float m22; -+ float m30; -+ float m31; -+ float m32; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_VIEWPORT -+///////////////////////////////////////////////////////////////////////// -+struct SWR_VIEWPORT -+{ -+ float x; -+ float y; -+ float width; -+ float height; -+ float minZ; -+ float maxZ; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_CULLMODE -+////////////////////////////////////////////////////////////////////////// -+enum SWR_CULLMODE -+{ -+ SWR_CULLMODE_BOTH, -+ SWR_CULLMODE_NONE, -+ SWR_CULLMODE_FRONT, -+ SWR_CULLMODE_BACK -+}; -+ -+enum SWR_FILLMODE -+{ -+ SWR_FILLMODE_POINT, -+ SWR_FILLMODE_WIREFRAME, -+ SWR_FILLMODE_SOLID -+}; -+ -+enum SWR_FRONTWINDING -+{ -+ SWR_FRONTWINDING_CW, -+ SWR_FRONTWINDING_CCW -+}; -+ -+#define SWR_MAX_NUM_MULTISAMPLES 16 -+enum SWR_MULTISAMPLE_COUNT -+{ -+ SWR_MULTISAMPLE_1X, -+ SWR_MULTISAMPLE_2X, -+ SWR_MULTISAMPLE_4X, -+ SWR_MULTISAMPLE_8X, -+ SWR_MULTISAMPLE_16X, -+ SWR_MULTISAMPLE_TYPE_MAX -+}; -+ -+enum SWR_PIXEL_LOCATION -+{ -+ SWR_PIXEL_LOCATION_CENTER, -+ SWR_PIXEL_LOCATION_UL, -+}; -+ -+// fixed point screen space sample locations within a pixel 
-+struct SWR_MULTISAMPLE_POS -+{ -+ uint32_t x; -+ uint32_t y; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_RASTSTATE -+////////////////////////////////////////////////////////////////////////// -+struct SWR_RASTSTATE -+{ -+ uint32_t cullMode : 2; -+ uint32_t fillMode : 2; -+ uint32_t frontWinding : 1; -+ uint32_t scissorEnable : 1; -+ uint32_t depthClipEnable : 1; -+ float pointSize; -+ float lineWidth; -+ -+ // point size output from the VS -+ bool pointParam; -+ uint32_t pointSizeAttrib; -+ -+ // point sprite -+ bool pointSpriteEnable; -+ bool pointSpriteTopOrigin; -+ uint32_t pointSpriteFESlot; -+ -+ // depth bias -+ float depthBias; -+ float slopeScaledDepthBias; -+ float depthBiasClamp; -+ SWR_FORMAT depthFormat; // @llvm_enum -+ -+ // multisample state -+ SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum -+ SWR_MULTISAMPLE_COUNT forcedSampleCount; // @llvm_enum -+ uint32_t pixelLocation; // UL or Center -+ uint32_t sampleMask; -+ uint8_t isSampleMasked[SWR_MAX_NUM_MULTISAMPLES]; -+ bool pixelOffset; // offset pixel positions by .5 in both the horizontal and vertical direction -+ SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES]; -+ -+ // user clip/cull distance enables -+ uint8_t cullDistanceMask; -+ uint8_t clipDistanceMask; -+}; -+ -+// backend state -+struct SWR_BACKEND_STATE -+{ -+ uint32_t constantInterpolationMask; -+ uint8_t numAttributes; -+ uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; -+}; -+ -+union SWR_DEPTH_STENCIL_STATE -+{ -+ struct -+ { -+ // dword 0 -+ uint32_t depthWriteEnable : 1; -+ uint32_t depthTestEnable : 1; -+ uint32_t stencilWriteEnable : 1; -+ uint32_t stencilTestEnable : 1; -+ uint32_t doubleSidedStencilTestEnable : 1; -+ -+ uint32_t depthTestFunc : 3; -+ uint32_t stencilTestFunc : 3; -+ -+ uint32_t backfaceStencilPassDepthPassOp : 3; -+ uint32_t backfaceStencilPassDepthFailOp : 3; -+ uint32_t backfaceStencilFailOp : 3; -+ uint32_t backfaceStencilTestFunc : 3; -+ uint32_t stencilPassDepthPassOp : 3; -+ uint32_t stencilPassDepthFailOp : 3; -+ uint32_t stencilFailOp : 3; -+ -+ // dword 1 -+ uint8_t backfaceStencilWriteMask; -+ uint8_t backfaceStencilTestMask; -+ uint8_t stencilWriteMask; -+ uint8_t stencilTestMask; -+ -+ // dword 2 -+ uint8_t backfaceStencilRefValue; -+ uint8_t stencilRefValue; -+ }; -+ uint32_t value[3]; -+}; -+ -+enum SWR_SHADING_RATE -+{ -+ SWR_SHADING_RATE_PIXEL, -+ SWR_SHADING_RATE_SAMPLE, -+ SWR_SHADING_RATE_COARSE, -+ SWR_SHADING_RATE_MAX, -+}; -+ -+// pixel shader state -+struct SWR_PS_STATE -+{ -+ // dword 0-1 -+ PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn -+ -+ // dword 2 -+ uint32_t killsPixel : 1; // pixel shader can kill pixels -+ uint32_t writesODepth : 1; // pixel shader writes to depth -+ uint32_t usesSourceDepth: 1; // pixel shader reads depth -+ uint32_t maxRTSlotUsed : 3; // maximum render target slot pixel shader writes to [0..7] -+ uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h -new file mode 100644 -index 0000000..915ac77 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h -@@ -0,0 +1,88 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tessellator.h -+* -+* @brief Tessellator fixed function unit interface definition -+* -+******************************************************************************/ -+#pragma once -+ -+/// Allocate and initialize a new tessellation context -+HANDLE SWR_API TSInitCtx( -+ SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) -+ SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm -+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology -+ void* pContextMem, ///< [IN] Memory to use for the context -+ size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required -+ -+/// Destroy & de-allocate tessellation context -+void SWR_API TSDestroyCtx( -+ HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed -+ -+struct SWR_TS_TESSELLATED_DATA -+{ -+ uint32_t NumPrimitives; -+ uint32_t NumDomainPoints; -+ -+ uint32_t* ppIndices[3]; -+ float* pDomainPointsU; -+ float* pDomainPointsV; -+ // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] -+}; -+ -+/// Perform Tessellation -+void SWR_API TSTessellate( -+ HANDLE tsCtx, ///< [IN] Tessellation Context -+ const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors -+ SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data -+ -+ -+ -+/// @TODO - Implement OSS tessellator -+ -+INLINE HANDLE SWR_API TSInitCtx( -+ SWR_TS_DOMAIN tsDomain, -+ SWR_TS_PARTITIONING tsPartitioning, -+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, -+ void* pContextMem, -+ size_t& memSize) -+{ -+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); -+ return NULL; -+} -+ -+ -+INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) -+{ -+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); -+} -+ -+ -+INLINE void SWR_API TSTessellate( -+ HANDLE tsCtx, -+ const SWR_TESSELLATION_FACTORS& tsTessFactors, -+ SWR_TS_TESSELLATED_DATA& tsTessellatedData) -+{ -+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp -new file mode 100644 -index 0000000..590bed4 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp -@@ -0,0 +1,884 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if defined(__linux__) || defined(__gnu_linux__) -+#include -+#include -+#include -+#include -+#endif -+ -+#include "common/os.h" -+#include "context.h" -+#include "frontend.h" -+#include "backend.h" -+#include "rasterizer.h" -+#include "rdtsc_core.h" -+#include "tilemgr.h" -+#include "core/multisample.h" -+ -+// ThreadId -+struct Core -+{ -+ uint32_t procGroup = 0; -+ std::vector threadIds; -+}; -+ -+struct NumaNode -+{ -+ std::vector cores; -+}; -+ -+typedef std::vector CPUNumaNodes; -+ -+void CalculateProcessorTopology(CPUNumaNodes& out_nodes) -+{ -+ out_nodes.clear(); -+#if defined(_WIN32) -+ -+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; -+ DWORD bufSize = sizeof(buffer); -+ -+ BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); -+ SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); -+ -+ uint32_t count = bufSize / buffer->Size; -+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; -+ -+ for (uint32_t i = 0; i < count; ++i) -+ { -+ SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); -+ for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) -+ { -+ auto& gmask = pBuffer->Processor.GroupMask[g]; -+ uint32_t threadId = 0; -+ uint32_t procGroup = gmask.Group; -+ -+ Core* pCore = nullptr; -+ -+ uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); -+ -+ while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) -+ { -+ // clear mask -+ gmask.Mask &= ~(KAFFINITY(1) << threadId); -+ -+ // Find Numa Node -+ PROCESSOR_NUMBER procNum = {}; -+ procNum.Group = WORD(procGroup); -+ procNum.Number = UCHAR(threadId); -+ -+ uint32_t numaId = 0; -+ ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); -+ SWR_ASSERT(ret); -+ -+ // Store data -+ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); -+ auto& numaNode = out_nodes[numaId]; -+ -+ uint32_t coreId = 0; -+ -+ if (nullptr == pCore) -+ { -+ numaNode.cores.push_back(Core()); -+ pCore = &numaNode.cores.back(); -+ pCore->procGroup = procGroup; -+#if !defined(_WIN64) -+ coreId = (uint32_t)numaNode.cores.size(); -+ if ((coreId * numThreads) >= 32) -+ { -+ // Windows doesn't return threadIds >= 32 for a processor group correctly -+ 
// when running a 32-bit application. -+ // Just save -1 as the threadId -+ threadId = uint32_t(-1); -+ } -+#endif -+ } -+ pCore->threadIds.push_back(threadId); -+ } -+ } -+ pBuffer = PtrAdd(pBuffer, pBuffer->Size); -+ } -+ -+ -+#elif defined(__linux__) || defined (__gnu_linux__) -+ -+ // Parse /proc/cpuinfo to get full topology -+ std::ifstream input("/proc/cpuinfo"); -+ std::string line; -+ char* c; -+ uint32_t threadId = uint32_t(-1); -+ uint32_t coreId = uint32_t(-1); -+ uint32_t numaId = uint32_t(-1); -+ -+ while (std::getline(input, line)) -+ { -+ if (line.find("processor") != std::string::npos) -+ { -+ if (threadId != uint32_t(-1)) -+ { -+ // Save information. -+ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); -+ auto& numaNode = out_nodes[numaId]; -+ if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); -+ auto& core = numaNode.cores[coreId]; -+ -+ core.procGroup = coreId; -+ core.threadIds.push_back(threadId); -+ } -+ -+ auto data_start = line.find(": ") + 2; -+ threadId = std::strtoul(&line.c_str()[data_start], &c, 10); -+ continue; -+ } -+ if (line.find("core id") != std::string::npos) -+ { -+ auto data_start = line.find(": ") + 2; -+ coreId = std::strtoul(&line.c_str()[data_start], &c, 10); -+ continue; -+ } -+ if (line.find("physical id") != std::string::npos) -+ { -+ auto data_start = line.find(": ") + 2; -+ numaId = std::strtoul(&line.c_str()[data_start], &c, 10); -+ continue; -+ } -+ } -+ -+ if (threadId != uint32_t(-1)) -+ { -+ // Save information. -+ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); -+ auto& numaNode = out_nodes[numaId]; -+ if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); -+ auto& core = numaNode.cores[coreId]; -+ -+ core.procGroup = coreId; -+ core.threadIds.push_back(threadId); -+ } -+ -+ for (uint32_t node = 0; node < out_nodes.size(); node++) { -+ auto& numaNode = out_nodes[node]; -+ auto it = numaNode.cores.begin(); -+ for ( ; it != numaNode.cores.end(); ) { -+ if (it->threadIds.size() == 0) -+ numaNode.cores.erase(it); -+ else -+ ++it; -+ } -+ } -+ -+#else -+ -+#error Unsupported platform -+ -+#endif -+} -+ -+ -+void bindThread(uint32_t threadId, uint32_t procGroupId = 0) -+{ -+#if defined(_WIN32) -+ { -+ GROUP_AFFINITY affinity = {}; -+ affinity.Group = procGroupId; -+ -+#if !defined(_WIN64) -+ if (threadId >= 32) -+ { -+ // In a 32-bit process on Windows it is impossible to bind -+ // to logical processors 32-63 within a processor group. -+ // In this case set the mask to 0 and let the system assign -+ // the processor. Hopefully it will make smart choices. 
-+ affinity.Mask = 0; -+ } -+ else -+#endif -+ { -+ affinity.Mask = KAFFINITY(1) << threadId; -+ } -+ -+ SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr); -+ } -+#else -+ cpu_set_t cpuset; -+ pthread_t thread = pthread_self(); -+ CPU_ZERO(&cpuset); -+ CPU_SET(threadId, &cpuset); -+ -+ pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); -+#endif -+} -+ -+INLINE -+uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) -+{ -+ //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); -+ //return result; -+ return pContext->DrawEnqueued; -+} -+ -+INLINE -+DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId) -+{ -+ return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT]; -+} -+ -+// returns true if dependency not met -+INLINE -+bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw) -+{ -+ return (pDC->dependency > lastRetiredDraw); -+} -+ -+void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -+{ -+ // Load clear color into SIMD register... -+ float *pClearData = (float*)(pHotTile->clearData); -+ simdscalar valR = _simd_broadcast_ss(&pClearData[0]); -+ simdscalar valG = _simd_broadcast_ss(&pClearData[1]); -+ simdscalar valB = _simd_broadcast_ss(&pClearData[2]); -+ simdscalar valA = _simd_broadcast_ss(&pClearData[3]); -+ -+ float *pfBuf = (float*)pHotTile->pBuffer; -+ uint32_t numSamples = pHotTile->numSamples; -+ -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) -+ { -+ _simd_store_ps(pfBuf, valR); -+ pfBuf += KNOB_SIMD_WIDTH; -+ _simd_store_ps(pfBuf, valG); -+ pfBuf += KNOB_SIMD_WIDTH; -+ _simd_store_ps(pfBuf, valB); -+ pfBuf += KNOB_SIMD_WIDTH; -+ _simd_store_ps(pfBuf, valA); -+ pfBuf += KNOB_SIMD_WIDTH; -+ } -+ } -+ } -+} -+ -+void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -+{ -+ // Load clear color into SIMD register... -+ float *pClearData = (float*)(pHotTile->clearData); -+ simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); -+ -+ float *pfBuf = (float*)pHotTile->pBuffer; -+ uint32_t numSamples = pHotTile->numSamples; -+ -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) -+ { -+ _simd_store_ps(pfBuf, valZ); -+ pfBuf += KNOB_SIMD_WIDTH; -+ } -+ } -+ } -+} -+ -+void ClearStencilHotTile(const HOTTILE* pHotTile) -+{ -+ // convert from F32 to U8. -+ uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); -+ //broadcast 32x into __m256i... -+ simdscalari valS = _simd_set1_epi8(clearVal); -+ -+ simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; -+ uint32_t numSamples = pHotTile->numSamples; -+ -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 
-+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) -+ { -+ _simd_store_si(pBuf, valS); -+ pBuf += 1; -+ } -+ } -+ } -+} -+ -+// for draw calls, we initialize the active hot tiles and perform deferred -+// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside -+// the draw routine itself mainly for performance, to avoid unnecessary setup -+// every triangle -+// @todo support deferred clear -+INLINE -+void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; -+ const SWR_PS_STATE& psState = state.psState; -+ uint32_t numRTs = psState.maxRTSlotUsed + 1; -+ -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroID, x, y); -+ x *= KNOB_MACROTILE_X_DIM; -+ y *= KNOB_MACROTILE_Y_DIM; -+ -+ uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); -+ -+ // check RT if enabled -+ if (state.psState.pfnPixelShader != nullptr) -+ { -+ for (uint32_t rt = 0; rt < numRTs; ++rt) -+ { -+ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, numSamples); -+ -+ if (pHotTile->state == HOTTILE_INVALID) -+ { -+ RDTSC_START(BELoadTiles); -+ // invalid hottile before draw requires a load from surface before we can draw to it -+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ else if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ RDTSC_START(BELoadTiles); -+ // Clear the tile. -+ ClearColorHotTile(pHotTile); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ } -+ } -+ -+ // check depth if enabled -+ if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable) -+ { -+ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); -+ if (pHotTile->state == HOTTILE_INVALID) -+ { -+ RDTSC_START(BELoadTiles); -+ // invalid hottile before draw requires a load from surface before we can draw to it -+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ else if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ RDTSC_START(BELoadTiles); -+ // Clear the tile. 
-+ ClearDepthHotTile(pHotTile); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ } -+ -+ // check stencil if enabled -+ if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable) -+ { -+ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); -+ if (pHotTile->state == HOTTILE_INVALID) -+ { -+ RDTSC_START(BELoadTiles); -+ // invalid hottile before draw requires a load from surface before we can draw to it -+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ else if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ RDTSC_START(BELoadTiles); -+ // Clear the tile. -+ ClearStencilHotTile(pHotTile); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ } -+} -+ -+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, volatile uint64_t& curDrawBE) -+{ -+ // increment our current draw id to the first incomplete draw -+ uint64_t drawEnqueued = GetEnqueuedDraw(pContext); -+ while (curDrawBE < drawEnqueued) -+ { -+ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ -+ // If it's not compute and FE is not done then break out of loop. -+ if (!pDC->doneFE && !pDC->isCompute) break; -+ -+ bool isWorkComplete = (pDC->isCompute) ? -+ pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); -+ -+ if (isWorkComplete) -+ { -+ curDrawBE++; -+ } -+ else -+ { -+ break; -+ } -+ } -+ -+ // If there are no more incomplete draws then return false. -+ return (curDrawBE >= drawEnqueued) ? false : true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief If there is any BE work then go work on it. -+/// @param pContext - pointer to SWR context. -+/// @param workerId - The unique worker ID that is assigned to this thread. -+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread -+/// has its own curDrawBE counter and this ensures that each worker processes all the -+/// draws in order. -+/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its -+/// own set and each time it fails to lock a macrotile, because it's already locked, -+/// then it will add that tile to the lockedTiles set. As a worker begins to work -+/// on future draws the lockedTiles set ensures that it doesn't work on tiles that may -+/// still have work pending in a previous draw. Additionally, the lockedTiles is a -+/// heuristic that can steer a worker back to the same macrotile that it had been -+/// working on in a previous draw. -+void WorkOnFifoBE( -+ SWR_CONTEXT *pContext, -+ uint32_t workerId, -+ volatile uint64_t &curDrawBE, -+ std::unordered_set<uint32_t>& lockedTiles) -+{ -+ // Find the first incomplete draw that has pending work. If no such draw is found then -+ // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. -+ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) -+ { -+ return; -+ } -+ -+ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; -+ -+ // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. -+ lockedTiles.clear(); -+ -+ // Try to work on each draw in order of the available draws in flight. -+ // 1.
If we're on curDrawBE, we can work on any macrotile that is available. -+ // 2. If we're trying to work on draws after curDrawBE, we are restricted to -+ // working on those macrotiles that are known to be complete in the prior draw to -+ // maintain order. The locked tiles provide the history to ensure this. -+ for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i) -+ { -+ DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ -+ if (pDC->isCompute) return; // We don't look at compute work. -+ -+ // First wait for FE to be finished with this draw. This keeps threading model simple -+ // but if there are lots of bubbles between draws then serializing FE and BE may -+ // need to be revisited. -+ if (!pDC->doneFE) break; -+ -+ // If this draw is dependent on a previous draw then we need to bail. -+ if (CheckDependency(pContext, pDC, lastRetiredDraw)) -+ { -+ return; -+ } -+ -+ // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. -+ std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles(); -+ -+ for (uint32_t tileID : macroTiles) -+ { -+ MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); -+ -+ // can only work on this draw if it's not in use by other threads -+ if (lockedTiles.find(tileID) == lockedTiles.end()) -+ { -+ if (tile.getNumQueued()) -+ { -+ if (tile.tryLock()) -+ { -+ BE_WORK *pWork; -+ -+ RDTSC_START(WorkerFoundWork); -+ -+ uint32_t numWorkItems = tile.getNumQueued(); -+ -+ if (numWorkItems != 0) -+ { -+ pWork = tile.peek(); -+ SWR_ASSERT(pWork); -+ if (pWork->type == DRAW) -+ { -+ InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc); -+ } -+ } -+ -+ while ((pWork = tile.peek()) != nullptr) -+ { -+ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); -+ tile.dequeue(); -+ } -+ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); -+ -+ _ReadWriteBarrier(); -+ -+ pDC->pTileMgr->markTileComplete(tileID); -+ -+ // Optimization: If the draw is complete and we're the last one to have worked on it then -+ // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. -+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) -+ { -+ // We can increment the current BE and safely move to next draw since we know this draw is complete. -+ curDrawBE++; -+ lastRetiredDraw++; -+ -+ lockedTiles.clear(); -+ break; -+ } -+ } -+ else -+ { -+ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-+ lockedTiles.insert(tileID); -+ } -+ } -+ } -+ } -+ } -+} -+ -+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawFE, UCHAR numaNode) -+{ -+ // Try to grab the next DC from the ring -+ uint64_t drawEnqueued = GetEnqueuedDraw(pContext); -+ while (curDrawFE < drawEnqueued) -+ { -+ uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT; -+ DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; -+ if (pDC->isCompute || pDC->doneFE || pDC->FeLock) -+ { -+ curDrawFE++; -+ } -+ else -+ { -+ break; -+ } -+ } -+ -+ uint64_t curDraw = curDrawFE; -+ while (curDraw < drawEnqueued) -+ { -+ uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; -+ DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; -+ -+ if (!pDC->isCompute && !pDC->FeLock) -+ { -+ uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0); -+ if (initial == 0) -+ { -+ // successfully grabbed the DC, now run the FE -+ pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc); -+ } -+ } -+ curDraw++; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief If there is any compute work then go work on it. -+/// @param pContext - pointer to SWR context. -+/// @param workerId - The unique worker ID that is assigned to this thread. -+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread -+/// has its own curDrawBE counter and this ensures that each worker processes all the -+/// draws in order. -+void WorkOnCompute( -+ SWR_CONTEXT *pContext, -+ uint32_t workerId, -+ volatile uint64_t& curDrawBE) -+{ -+ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) -+ { -+ return; -+ } -+ -+ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; -+ -+ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ if (pDC->isCompute == false) return; -+ -+ // check dependencies -+ if (CheckDependency(pContext, pDC, lastRetiredDraw)) -+ { -+ return; -+ } -+ -+ SWR_ASSERT(pDC->pDispatch != nullptr); -+ DispatchQueue& queue = *pDC->pDispatch; -+ -+ // Is there any work remaining? -+ if (queue.getNumQueued() > 0) -+ { -+ bool lastToComplete = false; -+ -+ uint32_t threadGroupId = 0; -+ while (queue.getWork(threadGroupId)) -+ { -+ ProcessComputeBE(pDC, workerId, threadGroupId); -+ -+ lastToComplete = queue.finishedWork(); -+ } -+ -+ _ReadWriteBarrier(); -+ -+ if (lastToComplete) -+ { -+ SWR_ASSERT(queue.isWorkComplete() == true); -+ pDC->doneCompute = true; -+ } -+ } -+} -+ -+DWORD workerThread(LPVOID pData) -+{ -+ THREAD_DATA *pThreadData = (THREAD_DATA*)pData; -+ SWR_CONTEXT *pContext = pThreadData->pContext; -+ uint32_t threadId = pThreadData->threadId; -+ uint32_t workerId = pThreadData->workerId; -+ -+ bindThread(threadId, pThreadData->procGroupId); -+ -+ RDTSC_INIT(threadId); -+ -+ int numaNode = (int)pThreadData->numaId; -+ -+ // flush denormals to 0 -+ _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); -+ -+ // Track tiles locked by other threads. If we try to lock a macrotile and find its already -+ // locked then we'll add it to this list so that we don't try and lock it again. -+ std::unordered_set lockedTiles; -+ -+ // each worker has the ability to work on any of the queued draws as long as certain -+ // conditions are met. the data associated -+ // with a draw is guaranteed to be active as long as a worker hasn't signaled that he -+ // has moved on to the next draw when he determines there is no more work to do. 
The api -+ // thread will not increment the head of the dc ring until all workers have moved past the -+ // current head. -+ // the logic to determine what to work on is: -+ // 1- try to work on the FE any draw that is queued. For now there are no dependencies -+ // on the FE work, so any worker can grab any FE and process in parallel. Eventually -+ // we'll need dependency tracking to force serialization on FEs. The worker will try -+ // to pick an FE by atomically incrementing a counter in the swr context. he'll keep -+ // trying until he reaches the tail. -+ // 2- BE work must be done in strict order. we accomplish this today by pulling work off -+ // the oldest draw (ie the head) of the dcRing. the worker can determine if there is -+ // any work left by comparing the total # of binned work items and the total # of completed -+ // work items. If they are equal, then there is no more work to do for this draw, and -+ // the worker can safely increment its oldestDraw counter and move on to the next draw. -+ std::unique_lock lock(pContext->WaitLock, std::defer_lock); -+ while (pContext->threadPool.inThreadShutdown == false) -+ { -+ uint32_t loop = 0; -+ while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && pContext->WorkerBE[workerId] == pContext->DrawEnqueued) -+ { -+ _mm_pause(); -+ } -+ -+ if (pContext->WorkerBE[workerId] == pContext->DrawEnqueued) -+ { -+ lock.lock(); -+ -+ // check for thread idle condition again under lock -+ if (pContext->WorkerBE[workerId] != pContext->DrawEnqueued) -+ { -+ lock.unlock(); -+ continue; -+ } -+ -+ if (pContext->threadPool.inThreadShutdown) -+ { -+ lock.unlock(); -+ break; -+ } -+ -+ RDTSC_START(WorkerWaitForThreadEvent); -+ -+ pContext->FifosNotEmpty.wait(lock); -+ lock.unlock(); -+ -+ RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0); -+ -+ if (pContext->threadPool.inThreadShutdown) -+ { -+ break; -+ } -+ } -+ -+ RDTSC_START(WorkerWorkOnFifoBE); -+ WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles); -+ RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); -+ -+ WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]); -+ -+ WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode); -+ } -+ -+ return 0; -+} -+ -+void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) -+{ -+ // Bind application thread to HW thread 0 -+ bindThread(0); -+ -+ CPUNumaNodes nodes; -+ CalculateProcessorTopology(nodes); -+ -+ uint32_t numHWNodes = (uint32_t)nodes.size(); -+ uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); -+ uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); -+ -+ uint32_t numNodes = numHWNodes; -+ uint32_t numCoresPerNode = numHWCoresPerNode; -+ uint32_t numHyperThreads = numHWHyperThreads; -+ -+ if (KNOB_MAX_NUMA_NODES) -+ { -+ numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); -+ } -+ -+ if (KNOB_MAX_CORES_PER_NUMA_NODE) -+ { -+ numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE); -+ } -+ -+ if (KNOB_MAX_THREADS_PER_CORE) -+ { -+ numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); -+ } -+ -+ // Calculate numThreads -+ uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; -+ -+ if (numThreads > KNOB_MAX_NUM_THREADS) -+ { -+ printf("WARNING: system thread count %u exceeds max %u, " -+ "performance will be degraded\n", -+ numThreads, KNOB_MAX_NUM_THREADS); -+ } -+ -+ if (numThreads == 1) -+ { -+ // If only 1 worker thread, try to move it to an available -+ // HW thread. If that fails, use the API thread. 
-+ if (numCoresPerNode < numHWCoresPerNode) -+ { -+ numCoresPerNode++; -+ } -+ else if (numHyperThreads < numHWHyperThreads) -+ { -+ numHyperThreads++; -+ } -+ else if (numNodes < numHWNodes) -+ { -+ numNodes++; -+ } -+ else -+ { -+ pPool->numThreads = 0; -+ SET_KNOB(SINGLE_THREADED, true); -+ return; -+ } -+ } -+ else -+ { -+ // Save a HW thread for the API thread. -+ numThreads--; -+ } -+ -+ pPool->numThreads = numThreads; -+ pContext->NumWorkerThreads = pPool->numThreads; -+ -+ pPool->inThreadShutdown = false; -+ pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); -+ -+ uint32_t workerId = 0; -+ for (uint32_t n = 0; n < numNodes; ++n) -+ { -+ auto& node = nodes[n]; -+ -+ uint32_t numCores = numCoresPerNode; -+ for (uint32_t c = 0; c < numCores; ++c) -+ { -+ auto& core = node.cores[c]; -+ for (uint32_t t = 0; t < numHyperThreads; ++t) -+ { -+ if (c == 0 && n == 0 && t == 0) -+ { -+ // Skip core 0, thread0 on node 0 to reserve for API thread -+ continue; -+ } -+ -+ pPool->pThreadData[workerId].workerId = workerId; -+ pPool->pThreadData[workerId].procGroupId = core.procGroup; -+ pPool->pThreadData[workerId].threadId = core.threadIds[t]; -+ pPool->pThreadData[workerId].numaId = n; -+ pPool->pThreadData[workerId].pContext = pContext; -+ pPool->threads[workerId] = new std::thread(workerThread, &pPool->pThreadData[workerId]); -+ -+ ++workerId; -+ } -+ } -+ } -+} -+ -+void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) -+{ -+ if (!KNOB_SINGLE_THREADED) -+ { -+ // Inform threads to finish up -+ std::unique_lock lock(pContext->WaitLock); -+ pPool->inThreadShutdown = true; -+ _mm_mfence(); -+ pContext->FifosNotEmpty.notify_all(); -+ lock.unlock(); -+ -+ // Wait for threads to finish and destroy them -+ for (uint32_t t = 0; t < pPool->numThreads; ++t) -+ { -+ pPool->threads[t]->join(); -+ delete(pPool->threads[t]); -+ } -+ -+ // Clean up data used by threads -+ free(pPool->pThreadData); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h -new file mode 100644 -index 0000000..0c91bf8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h -@@ -0,0 +1,62 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file threads.h -+* -+* @brief Definitions for SWR threading model. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "knobs.h" -+ -+#include -+#include -+typedef std::thread* THREAD_PTR; -+ -+struct SWR_CONTEXT; -+ -+struct THREAD_DATA -+{ -+ uint32_t procGroupId; // Will always be 0 for non-Windows OS -+ uint32_t threadId; // within the procGroup for Windows -+ uint32_t numaId; // NUMA node id -+ uint32_t workerId; -+ SWR_CONTEXT *pContext; -+}; -+ -+ -+struct THREAD_POOL -+{ -+ THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; -+ uint32_t numThreads; -+ volatile bool inThreadShutdown; -+ THREAD_DATA *pThreadData; -+}; -+ -+void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); -+void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); -+ -+// Expose FE and BE worker functions to the API thread if single threaded -+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawFE, UCHAR numaNode); -+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawBE, std::unordered_set &usedTiles); -+void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawBE); -diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp -new file mode 100644 -index 0000000..24b4b60 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp -@@ -0,0 +1,105 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tilemgr.cpp -+* -+* @brief Implementation for Macro Tile Manager which provides the facilities -+* for threads to work on an macro tile. 
-+* -+******************************************************************************/ -+#include -+ -+#include "fifo.hpp" -+#include "tilemgr.h" -+ -+#define TILE_ID(x,y) ((x << 16 | y)) -+ -+// override new/delete for alignment -+void *MacroTileMgr::operator new(size_t size) -+{ -+ return _aligned_malloc(size, 64); -+} -+ -+void MacroTileMgr::operator delete(void *p) -+{ -+ _aligned_free(p); -+} -+ -+void* DispatchQueue::operator new(size_t size) -+{ -+ return _aligned_malloc(size, 64); -+} -+ -+void DispatchQueue::operator delete(void *p) -+{ -+ _aligned_free(p); -+} -+ -+MacroTileMgr::MacroTileMgr() -+{ -+} -+ -+void MacroTileMgr::initialize() -+{ -+ mWorkItemsProduced = 0; -+ mWorkItemsConsumed = 0; -+ -+ mDirtyTiles.clear(); -+} -+ -+void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) -+{ -+ // Should not enqueue more then what we have backing for in the hot tile manager. -+ SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); -+ SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); -+ -+ uint32_t id = TILE_ID(x, y); -+ -+ MacroTileQueue &tile = mTiles[id]; -+ tile.mWorkItemsFE++; -+ -+ if (tile.mWorkItemsFE == 1) -+ { -+ tile.clear(); -+ mDirtyTiles.push_back(id); -+ } -+ -+ mWorkItemsProduced++; -+ tile.enqueue_try_nosync(pWork); -+} -+ -+void MacroTileMgr::markTileComplete(uint32_t id) -+{ -+ SWR_ASSERT(mTiles.find(id) != mTiles.end()); -+ MacroTileQueue &tile = mTiles[id]; -+ uint32_t numTiles = tile.mWorkItemsFE; -+ InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); -+ -+ _ReadWriteBarrier(); -+ tile.mWorkItemsBE += numTiles; -+ SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE); -+ -+ // clear out tile, but defer fifo clear until the next DC first queues to it. -+ // this prevents worker threads from constantly locking a completed macro tile -+ tile.mWorkItemsFE = 0; -+ tile.mWorkItemsBE = 0; -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h -new file mode 100644 -index 0000000..b537730 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h -@@ -0,0 +1,392 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tilemgr.h -+* -+* @brief Definitions for Macro Tile Manager which provides the facilities -+* for threads to work on an macro tile. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+#include "common/formats.h" -+#include "fifo.hpp" -+#include "context.h" -+#include "format_traits.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// MacroTile - work queue for a tile. -+////////////////////////////////////////////////////////////////////////// -+struct MacroTileQueue -+{ -+ MacroTileQueue() -+ { -+ mFifo.initialize(); -+ } -+ -+ ~MacroTileQueue() { } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Returns number of work items queued for this tile. -+ uint32_t getNumQueued() -+ { -+ return mFifo.getNumQueued(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Attempt to lock the work fifo. If already locked then return false. -+ bool tryLock() -+ { -+ return mFifo.tryLock(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Clear fifo and unlock it. -+ void clear() -+ { -+ mFifo.clear(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Peek at work sitting at the front of the fifo. -+ BE_WORK* peek() -+ { -+ return mFifo.peek(); -+ } -+ -+ bool enqueue_try_nosync(const BE_WORK* entry) -+ { -+ return mFifo.enqueue_try_nosync(entry); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Move to next work item -+ void dequeue() -+ { -+ mFifo.dequeue_noinc(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Destroy fifo -+ void destroy() -+ { -+ mFifo.destroy(); -+ } -+ -+ ///@todo This will all be private. -+ uint32_t mWorkItemsFE = 0; -+ uint32_t mWorkItemsBE = 0; -+ -+private: -+ QUEUE mFifo; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// MacroTileMgr - Manages macrotiles for a draw. -+////////////////////////////////////////////////////////////////////////// -+class MacroTileMgr -+{ -+public: -+ MacroTileMgr(); -+ ~MacroTileMgr() -+ { -+ for (auto &tile : mTiles) -+ { -+ tile.second.destroy(); -+ } -+ } -+ -+ void initialize(); -+ INLINE std::vector& getDirtyTiles() { return mDirtyTiles; } -+ INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } -+ void markTileComplete(uint32_t id); -+ -+ INLINE bool isWorkComplete() -+ { -+ return mWorkItemsProduced == mWorkItemsConsumed; -+ } -+ -+ void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork); -+ -+ static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y) -+ { -+ y = tileID & 0xffff; -+ x = (tileID >> 16) & 0xffff; -+ } -+ -+ void *operator new(size_t size); -+ void operator delete (void *p); -+ -+private: -+ SWR_FORMAT mFormat; -+ std::unordered_map mTiles; -+ -+ // Any tile that has work queued to it is a dirty tile. -+ std::vector mDirtyTiles; -+ -+ OSALIGNLINE(LONG) mWorkItemsProduced; -+ OSALIGNLINE(volatile LONG) mWorkItemsConsumed; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// DispatchQueue - work queue for dispatch -+////////////////////////////////////////////////////////////////////////// -+class DispatchQueue -+{ -+public: -+ DispatchQueue() {} -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Setup the producer consumer counts. 
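-+ /// @param totalTasks - total number of thread groups in this dispatch; both counters start at this value.
-+ /// @param pTaskData - opaque per-dispatch data that the dispatch callback will interpret.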
-+ void initialize(uint32_t totalTasks, void* pTaskData) -+ { -+ // The available and outstanding counts start with total tasks. -+ // At the start there are N tasks available and outstanding. -+ // When both the available and outstanding counts have reached 0 then all work has completed. -+ // When a worker starts on a threadgroup then it decrements the available count. -+ // When a worker completes a threadgroup then it decrements the outstanding count. -+ -+ mTasksAvailable = totalTasks; -+ mTasksOutstanding = totalTasks; -+ -+ mpTaskData = pTaskData; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Returns number of tasks available for this dispatch. -+ uint32_t getNumQueued() -+ { -+ return (mTasksAvailable > 0) ? mTasksAvailable : 0; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Atomically decrement the work available count. If the result -+ // is greater than 0 then we can work on the associated thread group. -+ // Otherwise, there is no more work to do. -+ bool getWork(uint32_t& groupId) -+ { -+ LONG result = InterlockedDecrement(&mTasksAvailable); -+ -+ if (result >= 0) -+ { -+ groupId = result; -+ return true; -+ } -+ -+ return false; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Atomically decrement the outstanding count. A worker is notifying -+ /// us that he just finished some work. Also, return true if we're -+ /// the last worker to complete this dispatch. -+ bool finishedWork() -+ { -+ LONG result = InterlockedDecrement(&mTasksOutstanding); -+ SWR_ASSERT(result >= 0, "Should never oversubscribe work"); -+ -+ return (result == 0) ? true : false; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Work is complete once both the available/outstanding counts have reached 0. -+ bool isWorkComplete() -+ { -+ return ((mTasksAvailable <= 0) && -+ (mTasksOutstanding <= 0)); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Return pointer to task data. -+ const void* GetTasksData() -+ { -+ return mpTaskData; -+ } -+ -+ void *operator new(size_t size); -+ void operator delete (void *p); -+ -+ void* mpTaskData; // The API thread will set this up and the callback task function will interpret this. -+ -+ OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; -+ OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 }; -+}; -+ -+ -+enum HOTTILE_STATE -+{ -+ HOTTILE_INVALID, // tile is in uninitialized state and should be loaded with surface contents before rendering -+ HOTTILE_CLEAR, // tile should be cleared -+ HOTTILE_DIRTY, // tile has been rendered to -+ HOTTILE_RESOLVED, // tile has been stored to memory -+}; -+ -+struct HOTTILE -+{ -+ BYTE *pBuffer; -+ HOTTILE_STATE state; -+ DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
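-+ // number of samples this hot tile buffer was allocated for; reallocated if a later draw needs more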
-+ uint32_t numSamples; -+ uint32_t renderTargetArrayIndex; // current render target array index loaded -+}; -+ -+union HotTileSet -+{ -+ struct -+ { -+ HOTTILE Color[SWR_NUM_RENDERTARGETS]; -+ HOTTILE Depth; -+ HOTTILE Stencil; -+ }; -+ HOTTILE Attachment[SWR_NUM_ATTACHMENTS]; -+}; -+ -+class HotTileMgr -+{ -+public: -+ HotTileMgr() -+ { -+ memset(&mHotTiles[0][0], 0, sizeof(mHotTiles)); -+ -+ // cache hottile size -+ for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) -+ { -+ mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits::bpp / 8; -+ } -+ mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits::bpp / 8; -+ mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits::bpp / 8; -+ } -+ -+ ~HotTileMgr() -+ { -+ for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x) -+ { -+ for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y) -+ { -+ for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) -+ { -+ if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) -+ { -+ _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); -+ mHotTiles[x][y].Attachment[a].pBuffer = NULL; -+ } -+ } -+ } -+ } -+ } -+ -+ HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, -+ uint32_t renderTargetArrayIndex = 0) -+ { -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroID, x, y); -+ -+ assert(x < KNOB_NUM_HOT_TILES_X); -+ assert(y < KNOB_NUM_HOT_TILES_Y); -+ -+ HotTileSet &tile = mHotTiles[x][y]; -+ HOTTILE& hotTile = tile.Attachment[attachment]; -+ if (hotTile.pBuffer == NULL) -+ { -+ if (create) -+ { -+ uint32_t size = numSamples * mHotTileSize[attachment]; -+ hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); -+ hotTile.state = HOTTILE_INVALID; -+ hotTile.numSamples = numSamples; -+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex; -+ } -+ else -+ { -+ return NULL; -+ } -+ } -+ else -+ { -+ // free the old tile and create a new one with enough space to hold all samples -+ if (numSamples > hotTile.numSamples) -+ { -+ // tile should be either uninitialized or resolved if we're deleting and switching to a -+ // new sample count -+ assert((hotTile.state == HOTTILE_INVALID) || -+ (hotTile.state == HOTTILE_RESOLVED)); -+ _aligned_free(hotTile.pBuffer); -+ -+ uint32_t size = numSamples * mHotTileSize[attachment]; -+ hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); -+ hotTile.state = HOTTILE_INVALID; -+ hotTile.numSamples = numSamples; -+ } -+ -+ // if requested render target array index isn't currently loaded, need to store out the current hottile -+ // and load the requested array slice -+ if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) -+ { -+ SWR_FORMAT format; -+ switch (attachment) -+ { -+ case SWR_ATTACHMENT_COLOR0: -+ case SWR_ATTACHMENT_COLOR1: -+ case SWR_ATTACHMENT_COLOR2: -+ case SWR_ATTACHMENT_COLOR3: -+ case SWR_ATTACHMENT_COLOR4: -+ case SWR_ATTACHMENT_COLOR5: -+ case SWR_ATTACHMENT_COLOR6: -+ case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; -+ default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ } -+ -+ if (hotTile.state == HOTTILE_DIRTY) -+ { -+ pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, -+ x * 
KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); -+ } -+ -+ pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, -+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); -+ -+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex; -+ hotTile.state = HOTTILE_DIRTY; -+ } -+ } -+ return &tile.Attachment[attachment]; -+ } -+ -+ HotTileSet &GetHotTile(uint32_t macroID) -+ { -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroID, x, y); -+ assert(x < KNOB_NUM_HOT_TILES_X); -+ assert(y < KNOB_NUM_HOT_TILES_Y); -+ -+ return mHotTiles[x][y]; -+ } -+ -+private: -+ HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; -+ uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; -+}; -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp -new file mode 100644 -index 0000000..f36452f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp -@@ -0,0 +1,148 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file utils.cpp -+* -+* @brief Utilities used by SWR core. -+* -+******************************************************************************/ -+#if defined(_WIN32) -+ -+#include -+#include -+#include -+#include -+ -+using namespace Gdiplus; -+ -+int GetEncoderClsid(const WCHAR* format, CLSID* pClsid) -+{ -+ uint32_t num = 0; // number of image encoders -+ uint32_t size = 0; // size of the image encoder array in bytes -+ -+ ImageCodecInfo* pImageCodecInfo = nullptr; -+ -+ GetImageEncodersSize(&num, &size); -+ if(size == 0) -+ return -1; // Failure -+ -+ pImageCodecInfo = (ImageCodecInfo*)(malloc(size)); -+ if(pImageCodecInfo == nullptr) -+ return -1; // Failure -+ -+ GetImageEncoders(num, size, pImageCodecInfo); -+ -+ for(uint32_t j = 0; j < num; ++j) -+ { -+ if( wcscmp(pImageCodecInfo[j].MimeType, format) == 0 ) -+ { -+ *pClsid = pImageCodecInfo[j].Clsid; -+ free(pImageCodecInfo); -+ return j; // Success -+ } -+ } -+ -+ free(pImageCodecInfo); -+ return -1; // Failure -+} -+ -+void SaveImageToPNGFile( -+ const WCHAR *pFilename, -+ void *pBuffer, -+ uint32_t width, -+ uint32_t height) -+{ -+ // dump pixels to a png -+ // Initialize GDI+. 
-+ GdiplusStartupInput gdiplusStartupInput; -+ ULONG_PTR gdiplusToken; -+ GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); -+ -+ Bitmap *bitmap = new Bitmap(width, height); -+ BYTE *pBytes = (BYTE*)pBuffer; -+ static const uint32_t bytesPerPixel = 4; -+ for (uint32_t y = 0; y < height; ++y) -+ for (uint32_t x = 0; x < width; ++x) -+ { -+ uint32_t pixel = *(uint32_t*)pBytes; -+ if (pixel == 0xcdcdcdcd) -+ { -+ pixel = 0xFFFF00FF; -+ } -+ else if (pixel == 0xdddddddd) -+ { -+ pixel = 0x80FF0000; -+ } -+ else -+ { -+ pixel |= 0xFF000000; -+ } -+ Color color(pixel); -+ bitmap->SetPixel(x, y, color); -+ pBytes += bytesPerPixel; -+ } -+ -+ // Save image. -+ CLSID pngClsid; -+ GetEncoderClsid(L"image/png", &pngClsid); -+ bitmap->Save(pFilename, &pngClsid, nullptr); -+ -+ delete bitmap; -+ -+ GdiplusShutdown(gdiplusToken); -+} -+ -+void OpenBitmapFromFile( -+ const WCHAR *pFilename, -+ void **pBuffer, -+ uint32_t *width, -+ uint32_t *height) -+{ -+ GdiplusStartupInput gdiplusStartupInput; -+ ULONG_PTR gdiplusToken; -+ GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); -+ -+ Bitmap *bitmap = new Bitmap(pFilename); -+ -+ *width = bitmap->GetWidth(); -+ *height = bitmap->GetHeight(); -+ *pBuffer = new BYTE[*width * *height * 4]; // width * height * |RGBA| -+ -+ // The folder 'stb_image' contains a PNG open/close module which -+ // is far less painful than this is, yo. -+ Gdiplus::Color clr; -+ for (uint32_t y = 0, idx = 0; y < *height; ++y) -+ { -+ for (uint32_t x = 0; x < *width; ++x, idx += 4) -+ { -+ bitmap->GetPixel(x, *height - y - 1, &clr); -+ ((BYTE*)*pBuffer)[idx + 0] = clr.GetBlue(); -+ ((BYTE*)*pBuffer)[idx + 1] = clr.GetGreen(); -+ ((BYTE*)*pBuffer)[idx + 2] = clr.GetRed(); -+ ((BYTE*)*pBuffer)[idx + 3] = clr.GetAlpha(); -+ } -+ } -+ -+ delete bitmap; -+ bitmap = 0; -+} -+#endif -diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h -new file mode 100644 -index 0000000..63d6ca1 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h -@@ -0,0 +1,745 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file utils.h -+* -+* @brief Utilities used by SWR core. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include "common/os.h" -+#include "common/simdintrin.h" -+#include "common/swr_assert.h" -+ -+#if defined(_WIN32) -+void SaveImageToPNGFile( -+ const WCHAR *pFilename, -+ void *pBuffer, -+ uint32_t width, -+ uint32_t height); -+ -+void OpenBitmapFromFile( -+ const WCHAR *pFilename, -+ void **pBuffer, -+ uint32_t *width, -+ uint32_t *height); -+#endif -+ -+/// @todo assume linux is always 64 bit -+#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) -+#define _MM_INSERT_EPI64 _mm_insert_epi64 -+#define _MM_EXTRACT_EPI64 _mm_extract_epi64 -+#else -+INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) -+{ -+ OSALIGNLINE(uint32_t) elems[4]; -+ _mm_store_si128((__m128i*)elems, a); -+ if (ndx == 0) -+ { -+ uint64_t foo = elems[0]; -+ foo |= (uint64_t)elems[1] << 32; -+ return foo; -+ } -+ else -+ { -+ uint64_t foo = elems[2]; -+ foo |= (uint64_t)elems[3] << 32; -+ return foo; -+ } -+} -+ -+INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) -+{ -+ OSALIGNLINE(int64_t) elems[2]; -+ _mm_store_si128((__m128i*)elems, a); -+ if (ndx == 0) -+ { -+ elems[0] = b; -+ } -+ else -+ { -+ elems[1] = b; -+ } -+ __m128i out; -+ out = _mm_load_si128((const __m128i*)elems); -+ return out; -+} -+#endif -+ -+OSALIGNLINE(struct) BBOX -+{ -+ int top, bottom, left, right; -+ -+ BBOX() {} -+ BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} -+ -+ bool operator==(const BBOX& rhs) -+ { -+ return (this->top == rhs.top && -+ this->bottom == rhs.bottom && -+ this->left == rhs.left && -+ this->right == rhs.right); -+ } -+ -+ bool operator!=(const BBOX& rhs) -+ { -+ return !(*this == rhs); -+ } -+}; -+ -+struct simdBBox -+{ -+ simdscalari top, bottom, left, right; -+}; -+ -+INLINE -+void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) -+{ -+ __m128i row0i = _mm_castps_si128(row0); -+ __m128i row1i = _mm_castps_si128(row1); -+ __m128i row2i = _mm_castps_si128(row2); -+ __m128i row3i = _mm_castps_si128(row3); -+ -+ __m128i vTemp = row2i; -+ row2i = _mm_unpacklo_epi32(row2i, row3i); -+ vTemp = _mm_unpackhi_epi32(vTemp, row3i); -+ -+ row3i = row0i; -+ row0i = _mm_unpacklo_epi32(row0i, row1i); -+ row3i = _mm_unpackhi_epi32(row3i, row1i); -+ -+ row1i = row0i; -+ row0i = _mm_unpacklo_epi64(row0i, row2i); -+ row1i = _mm_unpackhi_epi64(row1i, row2i); -+ -+ row2i = row3i; -+ row2i = _mm_unpacklo_epi64(row2i, vTemp); -+ row3i = _mm_unpackhi_epi64(row3i, vTemp); -+ -+ row0 = _mm_castsi128_ps(row0i); -+ row1 = _mm_castsi128_ps(row1i); -+ row2 = _mm_castsi128_ps(row2i); -+ row3 = _mm_castsi128_ps(row3i); -+} -+ -+INLINE -+void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) -+{ -+ __m128i vTemp = row2; -+ row2 = _mm_unpacklo_epi32(row2, row3); -+ vTemp = _mm_unpackhi_epi32(vTemp, row3); -+ -+ row3 = row0; -+ row0 = _mm_unpacklo_epi32(row0, row1); -+ row3 = _mm_unpackhi_epi32(row3, row1); -+ -+ row1 = row0; -+ row0 = _mm_unpacklo_epi64(row0, row2); -+ row1 = _mm_unpackhi_epi64(row1, row2); -+ -+ row2 = row3; -+ row2 = _mm_unpacklo_epi64(row2, vTemp); -+ row3 = _mm_unpackhi_epi64(row3, vTemp); -+} -+ -+#define GCC_VERSION (__GNUC__ * 10000 \ -+ + __GNUC_MINOR__ * 100 \ -+ + __GNUC_PATCHLEVEL__) -+ -+#if defined(__GNUC__) && (GCC_VERSION < 40900) -+#define _mm_undefined_ps _mm_setzero_ps -+#define _mm_undefined_si128 _mm_setzero_si128 -+#if KNOB_SIMD_WIDTH == 8 -+#define _mm256_undefined_ps _mm256_setzero_ps -+#endif 
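-+// end of fallbacks that map the _mm*_undefined_* intrinsics to setzero on GCC versions older than 4.9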
-+#endif -+ -+#if KNOB_SIMD_WIDTH == 8 -+INLINE -+void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) -+{ -+ __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 -+ __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 -+ __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 -+ __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 -+ -+ r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 -+ r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 -+ __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 -+ __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 -+ -+ vDst[0] = _mm256_castps256_ps128(r02r1xlolo); -+ vDst[1] = _mm256_castps256_ps128(r02r1xlohi); -+ vDst[2] = _mm256_castps256_ps128(r02r1xhilo); -+ vDst[3] = _mm256_castps256_ps128(r02r1xhihi); -+ -+ vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); -+ vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); -+ vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); -+ vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); -+} -+ -+INLINE -+void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) -+{ -+ __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 -+ __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 -+ __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 -+ __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 -+ -+ r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 -+ r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 -+ __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 -+ __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 -+ -+ vDst[0] = _mm256_castps256_ps128(r02r1xlolo); -+ vDst[1] = _mm256_castps256_ps128(r02r1xlohi); -+ vDst[2] = _mm256_castps256_ps128(r02r1xhilo); -+ vDst[3] = _mm256_castps256_ps128(r02r1xhihi); -+ -+ vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); -+ vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); -+ vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); -+ vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); -+} -+ -+INLINE -+void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) -+{ -+ __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); -+ __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); -+ __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); -+ __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); -+ __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); -+ __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); -+ __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); -+ __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); -+ __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); -+ __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); -+ __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); -+ __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); -+ vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); -+ vDst[1] = 
_mm256_permute2f128_ps(__tt1, __tt5, 0x20); -+ vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); -+ vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); -+ vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); -+ vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); -+ vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); -+ vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); -+} -+ -+INLINE -+void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) -+{ -+ vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), -+ _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// TranposeSingleComponent -+////////////////////////////////////////////////////////////////////////// -+template -+struct TransposeSingleComponent -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Pass-thru for single component. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose8_8_8_8 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose8_8_8_8 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 
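-+ /// The AVX path interleaves the channel halves with 128-bit unpacks; the AVX2 path uses a byte shuffle plus a lane permute.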
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ simdscalari src = _simd_load_si((const simdscalari*)pSrc); -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH == KNOB_ARCH_AVX -+ __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg -+ __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa -+ __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb -+ __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa -+ __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg -+ __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa -+ __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba -+ __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba -+ _mm_store_si128((__m128i*)pDst, c0123lo); -+ _mm_store_si128((__m128i*)(pDst + 16), c0123hi); -+#elif KNOB_ARCH == KNOB_ARCH_AVX2 -+ simdscalari dst01 = _mm256_shuffle_epi8(src, -+ _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); -+ simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); -+ dst23 = _mm256_shuffle_epi8(dst23, -+ _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); -+ simdscalari dst = _mm256_or_si256(dst01, dst23); -+ _simd_store_si((simdscalari*)pDst, dst); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose8_8_8 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose8_8_8 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose8_8 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose8_8 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 8_8 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ simdscalari src = _simd_load_si((const simdscalari*)pSrc); -+ -+#if KNOB_SIMD_WIDTH == 8 -+ __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg -+ __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg -+ rg = _mm_unpacklo_epi8(rg, g); -+ _mm_store_si128((__m128i*)pDst, rg); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose32_32_32_32 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose32_32_32_32 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 
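-+ /// Loads one simd register per component and interleaves them with vTranspose4x8 before storing the AOS result.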
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalar src0 = _simd_load_ps((const float*)pSrc); -+ simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); -+ simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); -+ simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); -+ -+ __m128 vDst[8]; -+ vTranspose4x8(vDst, src0, src1, src2, src3); -+ _mm_store_ps((float*)pDst, vDst[0]); -+ _mm_store_ps((float*)pDst+4, vDst[1]); -+ _mm_store_ps((float*)pDst+8, vDst[2]); -+ _mm_store_ps((float*)pDst+12, vDst[3]); -+ _mm_store_ps((float*)pDst+16, vDst[4]); -+ _mm_store_ps((float*)pDst+20, vDst[5]); -+ _mm_store_ps((float*)pDst+24, vDst[6]); -+ _mm_store_ps((float*)pDst+28, vDst[7]); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose32_32_32 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose32_32_32 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalar src0 = _simd_load_ps((const float*)pSrc); -+ simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); -+ simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); -+ -+ __m128 vDst[8]; -+ vTranspose3x8(vDst, src0, src1, src2); -+ _mm_store_ps((float*)pDst, vDst[0]); -+ _mm_store_ps((float*)pDst + 4, vDst[1]); -+ _mm_store_ps((float*)pDst + 8, vDst[2]); -+ _mm_store_ps((float*)pDst + 12, vDst[3]); -+ _mm_store_ps((float*)pDst + 16, vDst[4]); -+ _mm_store_ps((float*)pDst + 20, vDst[5]); -+ _mm_store_ps((float*)pDst + 24, vDst[6]); -+ _mm_store_ps((float*)pDst + 28, vDst[7]); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose32_32 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose32_32 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 
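-+ /// Interleaves the red and green component vectors using 128-bit unpack instructions.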
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ const float* pfSrc = (const float*)pSrc; -+ __m128 src_r0 = _mm_load_ps(pfSrc + 0); -+ __m128 src_r1 = _mm_load_ps(pfSrc + 4); -+ __m128 src_g0 = _mm_load_ps(pfSrc + 8); -+ __m128 src_g1 = _mm_load_ps(pfSrc + 12); -+ -+ __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); -+ __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); -+ __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); -+ __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); -+ -+ float* pfDst = (float*)pDst; -+ _mm_store_ps(pfDst + 0, dst0); -+ _mm_store_ps(pfDst + 4, dst1); -+ _mm_store_ps(pfDst + 8, dst2); -+ _mm_store_ps(pfDst + 12, dst3); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose16_16_16_16 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose16_16_16_16 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); -+ simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); -+ -+ __m128i src_r = _mm256_extractf128_si256(src_rg, 0); -+ __m128i src_g = _mm256_extractf128_si256(src_rg, 1); -+ __m128i src_b = _mm256_extractf128_si256(src_ba, 0); -+ __m128i src_a = _mm256_extractf128_si256(src_ba, 1); -+ -+ __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); -+ __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); -+ __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); -+ __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); -+ -+ __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); -+ __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); -+ __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); -+ __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); -+ -+ _mm_store_si128(((__m128i*)pDst) + 0, dst0); -+ _mm_store_si128(((__m128i*)pDst) + 1, dst1); -+ _mm_store_si128(((__m128i*)pDst) + 2, dst2); -+ _mm_store_si128(((__m128i*)pDst) + 3, dst3); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose16_16_16 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose16_16_16 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 
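-+ /// Splits each simd register into its 128-bit component planes and interleaves them with 16-bit and then 32-bit unpacks.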
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); -+ -+ __m128i src_r = _mm256_extractf128_si256(src_rg, 0); -+ __m128i src_g = _mm256_extractf128_si256(src_rg, 1); -+ __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); -+ __m128i src_a = _mm_undefined_si128(); -+ -+ __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); -+ __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); -+ __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); -+ __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); -+ -+ __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); -+ __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); -+ __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); -+ __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); -+ -+ _mm_store_si128(((__m128i*)pDst) + 0, dst0); -+ _mm_store_si128(((__m128i*)pDst) + 1, dst1); -+ _mm_store_si128(((__m128i*)pDst) + 2, dst2); -+ _mm_store_si128(((__m128i*)pDst) + 3, dst3); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose16_16 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose16_16 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 16_16 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ simdscalar src = _simd_load_ps((const float*)pSrc); -+ -+#if KNOB_SIMD_WIDTH == 8 -+ __m128 comp0 = _mm256_castps256_ps128(src); -+ __m128 comp1 = _mm256_extractf128_ps(src, 1); -+ -+ __m128i comp0i = _mm_castps_si128(comp0); -+ __m128i comp1i = _mm_castps_si128(comp1); -+ -+ __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); -+ __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); -+ -+ _mm_store_si128((__m128i*)pDst, resLo); -+ _mm_store_si128((__m128i*)pDst + 1, resHi); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose4_4_4_4 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose4_4_4_4 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose5_6_5 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose5_6_5 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 
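-+ /// Splits the single simd register into its two component halves and interleaves them with 16-bit unpacks.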
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose9_9_9_5 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose9_9_9_5 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose5_5_5_1 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose5_5_5_1 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose10_10_10_2 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose10_10_10_2 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose11_11_10 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose11_11_10 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
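-+ /// Not implemented; the deleted member function below makes any attempted use a compile-time error.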
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+// helper function to unroll loops -+template -+struct UnrollerL { -+ template -+ INLINE static void step(Lambda& func) { -+ func(Begin); -+ UnrollerL::step(func); -+ } -+}; -+ -+template -+struct UnrollerL { -+ template -+ static void step(Lambda& func) { -+ } -+}; -+ -+// general CRC compute -+INLINE -+uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) -+{ -+#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) -+ uint32_t sizeInQwords = size / sizeof(uint64_t); -+ uint32_t sizeRemainderBytes = size % sizeof(uint64_t); -+ uint64_t* pDataWords = (uint64_t*)pData; -+ for (uint32_t i = 0; i < sizeInQwords; ++i) -+ { -+ crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); -+ } -+#else -+ uint32_t sizeInDwords = size / sizeof(uint32_t); -+ uint32_t sizeRemainderBytes = size % sizeof(uint32_t); -+ uint32_t* pDataWords = (uint32_t*)pData; -+ for (uint32_t i = 0; i < sizeInDwords; ++i) -+ { -+ crc = _mm_crc32_u32(crc, *pDataWords++); -+ } -+#endif -+ -+ BYTE* pRemainderBytes = (BYTE*)pDataWords; -+ for (uint32_t i = 0; i < sizeRemainderBytes; ++i) -+ { -+ crc = _mm_crc32_u8(crc, *pRemainderBytes++); -+ } -+ -+ return crc; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Add byte offset to any-type pointer -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T* PtrAdd(T* p, intptr_t offset) -+{ -+ intptr_t intp = reinterpret_cast(p); -+ return reinterpret_cast(intp + offset); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Is a power-of-2? 
-+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static bool IsPow2(T value) -+{ -+ return value == (value & (0 - value)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align down to specified alignment -+/// Note: IsPow2(alignment) MUST be true -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignDownPow2(T1 value, T2 alignment) -+{ -+ SWR_ASSERT(IsPow2(alignment)); -+ return value & ~T1(alignment - 1); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align up to specified alignment -+/// Note: IsPow2(alignment) MUST be true -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignUpPow2(T1 value, T2 alignment) -+{ -+ return AlignDownPow2(value + T1(alignment - 1), alignment); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align down to specified alignment -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignDown(T1 value, T2 alignment) -+{ -+ if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } -+ return value - T1(value % alignment); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align up to specified alignment -+/// Note: IsPow2(alignment) MUST be true -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignUp(T1 value, T2 alignment) -+{ -+ return AlignDown(value + T1(alignment - 1), alignment); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp -new file mode 100644 -index 0000000..726b508 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp -@@ -0,0 +1,292 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file JitManager.cpp -+* -+* @brief Implementation if the Jit Manager. 
-+* -+* Notes: -+* -+******************************************************************************/ -+#if defined(_WIN32) -+#pragma warning(disable: 4800 4146 4244 4267 4355 4996) -+#endif -+ -+#include "jit_api.h" -+#include "JitManager.h" -+#include "fetch_jit.h" -+ -+#if defined(_WIN32) -+#include "llvm/ADT/Triple.h" -+#endif -+#include "llvm/IR/Function.h" -+#include "llvm/Support/DynamicLibrary.h" -+ -+#include "llvm/Support/MemoryBuffer.h" -+#include "llvm/Support/SourceMgr.h" -+#include "llvm/IRReader/IRReader.h" -+ -+#include "core/state.h" -+#include "common/containers.hpp" -+ -+#include "state_llvm.h" -+ -+#include -+#if defined(_WIN32) -+#include -+#include -+ -+#define INTEL_OUTPUT_DIR "c:\\Intel" -+#define RASTY_OUTPUT_DIR INTEL_OUTPUT_DIR "\\Rasty" -+#define JITTER_OUTPUT_DIR RASTY_OUTPUT_DIR "\\Jitter" -+#endif -+ -+using namespace llvm; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Contructor for JitManager. -+/// @param simdWidth - SIMD width to be used in generated program. -+JitManager::JitManager(uint32_t simdWidth, const char *arch) -+ : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch) -+{ -+ InitializeNativeTarget(); -+ InitializeNativeTargetAsmPrinter(); -+ InitializeNativeTargetDisassembler(); -+ -+ TargetOptions tOpts; -+ tOpts.AllowFPOpFusion = FPOpFusion::Fast; -+ tOpts.NoInfsFPMath = false; -+ tOpts.NoNaNsFPMath = false; -+ tOpts.UnsafeFPMath = true; -+#if defined(_DEBUG) -+ tOpts.NoFramePointerElim = true; -+#endif -+ -+ //tOpts.PrintMachineCode = true; -+ -+ std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << mJitNumber++; -+ std::unique_ptr newModule(new Module(fnName.str(), mContext)); -+ mpCurrentModule = newModule.get(); -+ -+ auto &&EB = EngineBuilder(std::move(newModule)); -+ EB.setTargetOptions(tOpts); -+ EB.setOptLevel(CodeGenOpt::Aggressive); -+ -+ StringRef hostCPUName; -+ -+ // force JIT to use the same CPU arch as the rest of rasty -+ if(mArch.AVX512F()) -+ { -+ assert(0 && "Implement AVX512 jitter"); -+ hostCPUName = sys::getHostCPUName(); -+ if (mVWidth == 0) -+ { -+ mVWidth = 16; -+ } -+ } -+ else if(mArch.AVX2()) -+ { -+ hostCPUName = StringRef("core-avx2"); -+ if (mVWidth == 0) -+ { -+ mVWidth = 8; -+ } -+ } -+ else if(mArch.AVX()) -+ { -+ if (mArch.F16C()) -+ { -+ hostCPUName = StringRef("core-avx-i"); -+ } -+ else -+ { -+ hostCPUName = StringRef("corei7-avx"); -+ } -+ if (mVWidth == 0) -+ { -+ mVWidth = 8; -+ } -+ } -+ else -+ { -+ hostCPUName = sys::getHostCPUName(); -+ if (mVWidth == 0) -+ { -+ mVWidth = 8; // 4? 
-+ } -+ } -+ -+ EB.setMCPU(hostCPUName); -+ -+#if defined(_WIN32) -+ // Needed for MCJIT on windows -+ Triple hostTriple(sys::getProcessTriple()); -+ hostTriple.setObjectFormat(Triple::ELF); -+ mpCurrentModule->setTargetTriple(hostTriple.getTriple()); -+#endif // _WIN32 -+ -+ mpExec = EB.create(); -+ -+#if LLVM_USE_INTEL_JITEVENTS -+ JITEventListener *vTune = JITEventListener::createIntelJITEventListener(); -+ mpExec->RegisterJITEventListener(vTune); -+#endif -+ -+ mFP32Ty = Type::getFloatTy(mContext); // float type -+ mInt8Ty = Type::getInt8Ty(mContext); -+ mInt32Ty = Type::getInt32Ty(mContext); // int type -+ mInt64Ty = Type::getInt64Ty(mContext); // int type -+ mV4FP32Ty = StructType::get(mContext, std::vector(4, mFP32Ty), false); // vector4 float type (represented as structure) -+ mV4Int32Ty = StructType::get(mContext, std::vector(4, mInt32Ty), false); // vector4 int type -+ -+ // fetch function signature -+ // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); -+ std::vector fsArgs; -+ fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0)); -+ fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0)); -+ -+ mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false); -+ -+ mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth); -+ mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth); -+ -+ mSimdVectorTy = StructType::get(mContext, std::vector(4, mSimtFP32Ty), false); -+ mSimdVectorInt32Ty = StructType::get(mContext, std::vector(4, mSimtInt32Ty), false); -+ -+#if defined(_WIN32) -+ // explicitly instantiate used symbols from potentially staticly linked libs -+ sys::DynamicLibrary::AddSymbol("exp2f", &exp2f); -+ sys::DynamicLibrary::AddSymbol("log2f", &log2f); -+ sys::DynamicLibrary::AddSymbol("sinf", &sinf); -+ sys::DynamicLibrary::AddSymbol("cosf", &cosf); -+ sys::DynamicLibrary::AddSymbol("powf", &powf); -+#endif -+ -+#if defined(_WIN32) -+ if (KNOB_DUMP_SHADER_IR) -+ { -+ CreateDirectory(INTEL_OUTPUT_DIR, NULL); -+ CreateDirectory(RASTY_OUTPUT_DIR, NULL); -+ CreateDirectory(JITTER_OUTPUT_DIR, NULL); -+ } -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create new LLVM module. -+void JitManager::SetupNewModule() -+{ -+ SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!"); -+ -+ std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << mJitNumber++; -+ std::unique_ptr newModule(new Module(fnName.str(), mContext)); -+ mpCurrentModule = newModule.get(); -+#if defined(_WIN32) -+ // Needed for MCJIT on windows -+ Triple hostTriple(sys::getProcessTriple()); -+ hostTriple.setObjectFormat(Triple::ELF); -+ newModule->setTargetTriple(hostTriple.getTriple()); -+#endif // _WIN32 -+ -+ mpExec->addModule(std::move(newModule)); -+ mIsModuleFinalized = false; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create new LLVM module from IR. -+bool JitManager::SetupModuleFromIR(const uint8_t *pIR) -+{ -+ std::unique_ptr pMem = MemoryBuffer::getMemBuffer(StringRef((const char*)pIR), ""); -+ -+ SMDiagnostic Err; -+ std::unique_ptr newModule = parseIR(pMem.get()->getMemBufferRef(), Err, mContext); -+ -+ if (newModule == nullptr) -+ { -+ SWR_ASSERT(0, "Parse failed! 
Check Err for details."); -+ return false; -+ } -+ -+ mpCurrentModule = newModule.get(); -+#if defined(_WIN32) -+ // Needed for MCJIT on windows -+ Triple hostTriple(sys::getProcessTriple()); -+ hostTriple.setObjectFormat(Triple::ELF); -+ newModule->setTargetTriple(hostTriple.getTriple()); -+#endif // _WIN32 -+ -+ mpExec->addModule(std::move(newModule)); -+ mIsModuleFinalized = false; -+ -+ return true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Dump function to file. -+void JitManager::DumpToFile(Function *f, const char *fileName) -+{ -+ if (KNOB_DUMP_SHADER_IR) -+ { -+#if defined(_WIN32) -+ DWORD pid = GetCurrentProcessId(); -+ TCHAR procname[MAX_PATH]; -+ GetModuleFileName(NULL, procname, MAX_PATH); -+ const char* pBaseName = strrchr(procname, '\\'); -+ std::stringstream outDir; -+ outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; -+ CreateDirectory(outDir.str().c_str(), NULL); -+#endif -+ -+ std::error_code EC; -+ const char *funcName = f->getName().data(); -+ char fName[256]; -+#if defined(_WIN32) -+ sprintf(fName, "%s\\%s.%s.ll", outDir.str().c_str(), funcName, fileName); -+#else -+ sprintf(fName, "%s.%s.ll", funcName, fileName); -+#endif -+ raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None); -+ Module* pModule = f->getParent(); -+ pModule->print(fd, nullptr); -+ fd.flush(); -+ } -+} -+ -+extern "C" -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Create JIT context. -+ /// @param simdWidth - SIMD width to be used in generated program. -+ HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch) -+ { -+ return new JitManager(targetSimdWidth, arch); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Destroy JIT context. -+ void JITCALL JitDestroyContext(HANDLE hJitContext) -+ { -+ delete reinterpret_cast(hJitContext); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h -new file mode 100644 -index 0000000..e0e8ec4 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h -@@ -0,0 +1,182 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
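A usage sketch of the extern "C" entry points just defined. Passing 0 for the SIMD width defers to the constructor's per-ISA default (8 for AVX/AVX2, 16 once the AVX512 path is implemented), and an empty arch string means "use the host ISA unless RASTY_KNOB_ARCH_STR overrides it". This assumes the declarations are exposed through jit_api.h, as the includes above suggest:

    #include "jit_api.h"   // assumed to declare HANDLE, JITCALL, JitCreateContext, JitDestroyContext

    int main()
    {
        // Width 0 => JitManager picks the default for the detected/requested ISA.
        // ""      => no explicit ISA cap; RASTY_KNOB_ARCH_STR may still lower it.
        HANDLE hJit = JitCreateContext(0, "");

        // ... build modules and JIT shaders against hJit ...

        // Deletes the JitManager; function pointers obtained through this
        // context should not be assumed valid after it is destroyed.
        JitDestroyContext(hJit);
        return 0;
    }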
-+* -+* @file JitManager.h -+* -+* @brief JitManager contains the LLVM data structures used for JIT generation -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/os.h" -+#include "common/isa.hpp" -+ -+#if defined(_WIN32) -+#pragma warning(disable : 4146 4244 4267 4800 4996) -+#endif -+ -+#include "llvm/IR/DataLayout.h" -+#include "llvm/IR/Instructions.h" -+#include "llvm/IR/LLVMContext.h" -+#include "llvm/IR/Module.h" -+#include "llvm/IR/Type.h" -+#include "llvm/IR/IRBuilder.h" -+#include "llvm/IR/IntrinsicInst.h" -+ -+#include "llvm/Config/llvm-config.h" -+#ifndef LLVM_VERSION_MAJOR -+#include "llvm/Config/config.h" -+#endif -+ -+#include "llvm/IR/Verifier.h" -+#include "llvm/ExecutionEngine/MCJIT.h" -+#include "llvm/Support/FileSystem.h" -+#define LLVM_F_NONE sys::fs::F_None -+ -+#include "llvm/Analysis/Passes.h" -+#include "llvm/PassManager.h" -+#include "llvm/CodeGen/Passes.h" -+#include "llvm/ExecutionEngine/ExecutionEngine.h" -+#include "llvm/Support/raw_ostream.h" -+#include "llvm/Support/TargetSelect.h" -+#include "llvm/Transforms/IPO.h" -+#include "llvm/Transforms/Scalar.h" -+#include "llvm/Support/Host.h" -+ -+ -+using namespace llvm; -+////////////////////////////////////////////////////////////////////////// -+/// JitInstructionSet -+/// @brief Subclass of InstructionSet that allows users to override -+/// the reporting of support for certain ISA features. This allows capping -+/// the jitted code to a certain feature level, e.g. jit AVX level code on -+/// a platform that supports AVX2. -+////////////////////////////////////////////////////////////////////////// -+class JitInstructionSet : public InstructionSet -+{ -+public: -+ JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa) -+ { -+ if (isaRequest == "") -+ { -+ // Check for an environment variable -+ const char* pIsaEnv = getenv("RASTY_KNOB_ARCH_STR"); -+ if (pIsaEnv) -+ { -+ isaRequest = pIsaEnv; -+ } -+ } -+ std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); -+ -+ if(isaRequest == "avx") -+ { -+ bForceAVX = true; -+ bForceAVX2 = false; -+ bForceAVX512 = false; -+ } -+ else if(isaRequest == "avx2") -+ { -+ bForceAVX = false; -+ bForceAVX2 = true; -+ bForceAVX512 = false; -+ } -+ #if 0 -+ else if(isaRequest == "avx512") -+ { -+ bForceAVX = false; -+ bForceAVX2 = false; -+ bForceAVX512 = true; -+ } -+ #endif -+ }; -+ -+ bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } -+ bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); } -+ bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); } -+ -+private: -+ bool bForceAVX = false; -+ bool bForceAVX2 = false; -+ bool bForceAVX512 = false; -+ std::string isaRequest; -+}; -+ -+ -+ -+struct JitLLVMContext : LLVMContext -+{ -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// JitManager -+////////////////////////////////////////////////////////////////////////// -+struct JitManager -+{ -+ JitManager(uint32_t w, const char *arch); -+ ~JitManager(){}; -+ -+ JitLLVMContext mContext; ///< LLVM compiler -+ IRBuilder<> mBuilder; ///< LLVM IR Builder -+ ExecutionEngine* mpExec; -+ -+ // Need to be rebuilt after a JIT and before building new IR -+ Module* mpCurrentModule; -+ bool mIsModuleFinalized; -+ uint32_t mJitNumber; -+ -+ uint32_t mVWidth; -+ -+ // Built in types. 
-+ Type* mInt8Ty; -+ Type* mInt32Ty; -+ Type* mInt64Ty; -+ Type* mFP32Ty; -+ StructType* mV4FP32Ty; -+ StructType* mV4Int32Ty; -+ -+ // helper scalar function types -+ FunctionType* mUnaryFPTy; -+ FunctionType* mBinaryFPTy; -+ FunctionType* mTrinaryFPTy; -+ FunctionType* mUnaryIntTy; -+ FunctionType* mBinaryIntTy; -+ FunctionType* mTrinaryIntTy; -+ -+ Type* mSimtFP32Ty; -+ Type* mSimtInt32Ty; -+ -+ Type* mSimdVectorInt32Ty; -+ Type* mSimdVectorTy; -+ -+ // fetch shader types -+ FunctionType* mFetchShaderTy; -+ -+ JitInstructionSet mArch; -+ -+ void SetupNewModule(); -+ bool SetupModuleFromIR(const uint8_t *pIR); -+ -+ static void DumpToFile(Function *f, const char *fileName); -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp -new file mode 100644 -index 0000000..5e8e5f4 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp -@@ -0,0 +1,473 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
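The point of JitInstructionSet is to let an explicit arch string, or the RASTY_KNOB_ARCH_STR environment variable, cap the ISA the jitter targets even when the hardware supports more. A minimal sketch of the same idea, independent of the InstructionSet base class; the cpuid plumbing is stubbed out with a hypothetical HwSupportsAvx2():

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // Stand-in for the real cpuid-based detection in common/isa.hpp.
    static bool HwSupportsAvx2() { return true; }   // assumption for the sketch

    class IsaCap
    {
    public:
        explicit IsaCap(std::string request)
        {
            if (request.empty())
            {
                // Same override hook as JitInstructionSet.
                if (const char* env = std::getenv("RASTY_KNOB_ARCH_STR"))
                    request = env;
            }
            std::transform(request.begin(), request.end(), request.begin(), ::tolower);
            mForceAvx = (request == "avx");   // cap at AVX: stop reporting AVX2
        }

        // Mirrors JitInstructionSet::AVX2(): a forced lower cap wins over hardware.
        bool AVX2() const { return mForceAvx ? false : HwSupportsAvx2(); }

    private:
        bool mForceAvx = false;
    };

    int main() { printf("%d %d\n", IsaCap("").AVX2(), IsaCap("avx").AVX2()); }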
-+* -+* @file blend_jit.cpp -+* -+* @brief Implementation of the blend jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#include "jit_api.h" -+#include "blend_jit.h" -+#include "builder.h" -+#include "state_llvm.h" -+#include "common/containers.hpp" -+#include "llvm/IR/DataLayout.h" -+ -+#include -+ -+// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized -+#define QUANTIZE_THRESHOLD 2 -+ -+////////////////////////////////////////////////////////////////////////// -+/// Interface to Jitting a blend shader -+////////////////////////////////////////////////////////////////////////// -+struct BlendJit : public Builder -+{ -+ BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; -+ -+ template -+ void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4]) -+ { -+ Value* out[4]; -+ -+ switch (factor) -+ { -+ case BLENDFACTOR_ONE: -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); -+ break; -+ case BLENDFACTOR_SRC_COLOR: -+ out[0] = src[0]; -+ out[1] = src[1]; -+ out[2] = src[2]; -+ out[3] = src[3]; -+ break; -+ case BLENDFACTOR_SRC_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = src[3]; -+ break; -+ case BLENDFACTOR_DST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = dst[3]; -+ break; -+ case BLENDFACTOR_DST_COLOR: -+ out[0] = dst[0]; -+ out[1] = dst[1]; -+ out[2] = dst[2]; -+ out[3] = dst[3]; -+ break; -+ case BLENDFACTOR_SRC_ALPHA_SATURATE: -+ out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); -+ out[3] = VIMMED1(1.0f); -+ break; -+ case BLENDFACTOR_CONST_COLOR: -+ out[0] = constColor[0]; -+ out[1] = constColor[1]; -+ out[2] = constColor[2]; -+ out[3] = constColor[3]; -+ break; -+ case BLENDFACTOR_CONST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = constColor[3]; -+ break; -+ case BLENDFACTOR_SRC1_COLOR: -+ out[0] = src1[0]; -+ out[1] = src1[1]; -+ out[2] = src1[2]; -+ out[3] = src1[3]; -+ break; -+ case BLENDFACTOR_SRC1_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = src1[3]; -+ break; -+ case BLENDFACTOR_ZERO: -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); -+ break; -+ case BLENDFACTOR_INV_SRC_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), src[0]); -+ out[1] = FSUB(VIMMED1(1.0f), src[1]); -+ out[2] = FSUB(VIMMED1(1.0f), src[2]); -+ out[3] = FSUB(VIMMED1(1.0f), src[3]); -+ break; -+ case BLENDFACTOR_INV_SRC_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); -+ break; -+ case BLENDFACTOR_INV_DST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); -+ break; -+ case BLENDFACTOR_INV_DST_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), dst[0]); -+ out[1] = FSUB(VIMMED1(1.0f), dst[1]); -+ out[2] = FSUB(VIMMED1(1.0f), dst[2]); -+ out[3] = FSUB(VIMMED1(1.0f), dst[3]); -+ break; -+ case BLENDFACTOR_INV_CONST_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), constColor[0]); -+ out[1] = FSUB(VIMMED1(1.0f), constColor[1]); -+ out[2] = FSUB(VIMMED1(1.0f), constColor[2]); -+ out[3] = FSUB(VIMMED1(1.0f), constColor[3]); -+ break; -+ case BLENDFACTOR_INV_CONST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); -+ break; -+ case BLENDFACTOR_INV_SRC1_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), src1[0]); -+ out[1] = FSUB(VIMMED1(1.0f), src1[1]); -+ out[2] = FSUB(VIMMED1(1.0f), src1[2]); -+ out[3] = FSUB(VIMMED1(1.0f), src1[3]); -+ break; -+ case BLENDFACTOR_INV_SRC1_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); -+ break; -+ default: -+ SWR_ASSERT(false, 
"Unsupported blend factor: %d", factor); -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); -+ break; -+ } -+ -+ if (Color) -+ { -+ result[0] = out[0]; -+ result[1] = out[1]; -+ result[2] = out[2]; -+ } -+ -+ if (Alpha) -+ { -+ result[3] = out[3]; -+ } -+ } -+ -+ void Clamp(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ SWR_TYPE type = info.type[0]; -+ -+ switch (type) -+ { -+ case SWR_TYPE_FLOAT: -+ break; -+ -+ case SWR_TYPE_UNORM: -+ src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ break; -+ -+ case SWR_TYPE_SNORM: -+ src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ break; -+ -+ default: SWR_ASSERT(false, "Unsupport format type: %d", type); -+ } -+ } -+ -+ void ApplyDefaults(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ -+ bool valid[] = { false, false, false, false }; -+ for (uint32_t c = 0; c < info.numComps; ++c) -+ { -+ valid[info.swizzle[c]] = true; -+ } -+ -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ if (!valid[c]) -+ { -+ src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty); -+ } -+ } -+ } -+ -+ void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ -+ for (uint32_t c = 0; c < info.numComps; ++c) -+ { -+ if (info.type[c] == SWR_TYPE_UNUSED) -+ { -+ src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); -+ } -+ } -+ } -+ -+ void Quantize(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ for (uint32_t c = 0; c < info.numComps; ++c) -+ { -+ if (info.bpc[c] <= QUANTIZE_THRESHOLD) -+ { -+ uint32_t swizComp = info.swizzle[c]; -+ float factor = (float)((1 << info.bpc[c]) - 1); -+ switch (info.type[c]) -+ { -+ case SWR_TYPE_UNORM: -+ src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); -+ src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); -+ src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor)); -+ break; -+ default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]); -+ } -+ } -+ } -+ } -+ -+ template -+ void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4]) -+ { -+ Value* out[4]; -+ Value* srcBlend[4]; -+ Value* dstBlend[4]; -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ srcBlend[i] = FMUL(src[i], srcFactor[i]); -+ dstBlend[i] = FMUL(dst[i], dstFactor[i]); -+ } -+ -+ switch (blendOp) -+ { -+ case BLENDOP_ADD: -+ out[0] = FADD(srcBlend[0], dstBlend[0]); -+ out[1] = FADD(srcBlend[1], dstBlend[1]); -+ out[2] = FADD(srcBlend[2], dstBlend[2]); -+ out[3] = FADD(srcBlend[3], dstBlend[3]); -+ break; -+ -+ case BLENDOP_SUBTRACT: -+ out[0] = FSUB(srcBlend[0], dstBlend[0]); -+ out[1] = FSUB(srcBlend[1], dstBlend[1]); -+ out[2] = FSUB(srcBlend[2], dstBlend[2]); -+ out[3] = FSUB(srcBlend[3], dstBlend[3]); -+ break; -+ -+ case BLENDOP_REVSUBTRACT: -+ out[0] = FSUB(dstBlend[0], srcBlend[0]); -+ out[1] = FSUB(dstBlend[1], srcBlend[1]); -+ out[2] = FSUB(dstBlend[2], srcBlend[2]); -+ out[3] = 
FSUB(dstBlend[3], srcBlend[3]); -+ break; -+ -+ case BLENDOP_MIN: -+ out[0] = VMINPS(src[0], dst[0]); -+ out[1] = VMINPS(src[1], dst[1]); -+ out[2] = VMINPS(src[2], dst[2]); -+ out[3] = VMINPS(src[3], dst[3]); -+ break; -+ -+ case BLENDOP_MAX: -+ out[0] = VMAXPS(src[0], dst[0]); -+ out[1] = VMAXPS(src[1], dst[1]); -+ out[2] = VMAXPS(src[2], dst[2]); -+ out[3] = VMAXPS(src[3], dst[3]); -+ break; -+ -+ default: -+ SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp); -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); -+ break; -+ } -+ -+ if (Color) -+ { -+ result[0] = out[0]; -+ result[1] = out[1]; -+ result[2] = out[2]; -+ } -+ -+ if (Alpha) -+ { -+ result[3] = out[3]; -+ } -+ } -+ -+ Function* Create(const BLEND_COMPILE_STATE& state) -+ { -+ static std::size_t jitNum = 0; -+ -+ std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << jitNum++; -+ -+ // blend function signature -+ // typedef void(*PFN_BLEND_JIT_FUNC)(SWR_BLEND_STATE*, simdvector&, simdvector&, uint8_t*, simdvector&); -+ -+ std::vector args{ -+ PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE* -+ PointerType::get(mSimdFP32Ty, 0), // simdvector& src -+ PointerType::get(mSimdFP32Ty, 0), // simdvector& src1 -+ PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst -+ PointerType::get(mSimdFP32Ty, 0), // simdvector& result -+ }; -+ -+ FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); -+ Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); -+ -+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); -+ -+ IRB()->SetInsertPoint(entry); -+ -+ // arguments -+ auto argitr = blendFunc->getArgumentList().begin(); -+ Value* pBlendState = argitr++; -+ pBlendState->setName("pBlendState"); -+ Value* pSrc = argitr++; -+ pSrc->setName("src"); -+ Value* pSrc1 = argitr++; -+ pSrc1->setName("src1"); -+ Value* pDst = argitr++; -+ pDst->setName("pDst"); -+ Value* pResult = argitr++; -+ pResult->setName("result"); -+ -+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); -+ Value* dst[4]; -+ Value* constantColor[4]; -+ Value* src[4]; -+ Value* src1[4]; -+ Value* result[4]; -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ // load hot tile -+ dst[i] = LOAD(pDst, { i }); -+ -+ // load constant color -+ constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i })); -+ -+ // load src -+ src[i] = LOAD(pSrc, { i }); -+ -+ // load src1 -+ src1[i] = LOAD(pSrc1, { i }); -+ } -+ -+ // clamp sources -+ Clamp(state.format, src); -+ Clamp(state.format, src1); -+ Clamp(state.format, dst); -+ Clamp(state.format, constantColor); -+ -+ // apply defaults to hottile contents to take into account missing components -+ ApplyDefaults(state.format, dst); -+ -+ // Force defaults for unused 'X' components -+ ApplyUnusedDefaults(state.format, dst); -+ -+ // Quantize low precision components -+ Quantize(state.format, dst); -+ -+ // special case clamping for R11G11B10_float which has no sign bit -+ if (state.format == R11G11B10_FLOAT) -+ { -+ dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); -+ dst[1] = VMAXPS(dst[1], VIMMED1(0.0f)); -+ dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); -+ dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); -+ } -+ -+ Value* srcFactor[4]; -+ Value* dstFactor[4]; -+ if (state.independentAlphaBlendEnable) -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); 
-+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor); -+ -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ BlendFunc((SWR_BLEND_OP)state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+ else -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+ -+ // store results out -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ STORE(result[i], pResult, { i }); -+ } -+ -+ RET_VOID(); -+ -+ JitManager::DumpToFile(blendFunc, ""); -+ -+ FunctionPassManager passes(JM()->mpCurrentModule); -+ passes.add(createBreakCriticalEdgesPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createPromoteMemoryToRegisterPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createInstructionCombiningPass()); -+ passes.add(createInstructionSimplifierPass()); -+ passes.add(createConstantPropagationPass()); -+ passes.add(createSCCPPass()); -+ passes.add(createAggressiveDCEPass()); -+ -+ passes.run(*blendFunc); -+ -+ JitManager::DumpToFile(blendFunc, "optimized"); -+ -+ return blendFunc; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JITs from fetch shader IR -+/// @param hJitMgr - JitManager handle -+/// @param func - LLVM function IR -+/// @return PFN_FETCH_FUNC - pointer to fetch code -+PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) -+{ -+ const llvm::Function *func = (const llvm::Function*)hFunc; -+ JitManager* pJitMgr = reinterpret_cast(hJitMgr); -+ PFN_BLEND_JIT_FUNC pfnBlend; -+ pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); -+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module -+ pJitMgr->mIsModuleFinalized = true; -+ -+ return pfnBlend; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles blend shader -+/// @param hJitMgr - JitManager handle -+/// @param state - blend state to build function from -+extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state) -+{ -+ JitManager* pJitMgr = reinterpret_cast(hJitMgr); -+ -+ pJitMgr->SetupNewModule(); -+ -+ BlendJit theJit(pJitMgr); -+ HANDLE hFunc = theJit.Create(state); -+ -+ return JitBlendFunc(hJitMgr, hFunc); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h -new file mode 100644 -index 0000000..80c4c03 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h -@@ -0,0 +1,49 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
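For reference, the arithmetic the jitted blend path encodes, restated for a single pixel with plain floats: factor selection as in GenerateBlendFactor() and the BLENDOP_ADD case of BlendFunc(). This is an illustrative sketch covering only a few factor cases, with no clamping, quantization, or dual-source factors; it is not driver code:

    #include <cstdio>

    struct Color { float r, g, b, a; };

    // Subset of the factors handled in GenerateBlendFactor().
    enum class Factor { One, Zero, SrcAlpha, InvSrcAlpha };

    static Color FactorValue(Factor f, const Color& src)
    {
        switch (f)
        {
        case Factor::One:         return {1.f, 1.f, 1.f, 1.f};
        case Factor::Zero:        return {0.f, 0.f, 0.f, 0.f};
        case Factor::SrcAlpha:    return {src.a, src.a, src.a, src.a};
        case Factor::InvSrcAlpha: return {1.f - src.a, 1.f - src.a, 1.f - src.a, 1.f - src.a};
        }
        return {0.f, 0.f, 0.f, 0.f};
    }

    // BLENDOP_ADD from BlendFunc(): result = src*srcFactor + dst*dstFactor, per channel.
    static Color BlendAdd(const Color& src, const Color& dst, Factor sf, Factor df)
    {
        Color s = FactorValue(sf, src);
        Color d = FactorValue(df, src);
        return { src.r * s.r + dst.r * d.r,
                 src.g * s.g + dst.g * d.g,
                 src.b * s.b + dst.b * d.b,
                 src.a * s.a + dst.a * d.a };
    }

    int main()
    {
        Color src = {1.0f, 0.0f, 0.0f, 0.5f};   // half-transparent red fragment
        Color dst = {0.0f, 0.0f, 1.0f, 1.0f};   // opaque blue already in the hot tile
        Color out = BlendAdd(src, dst, Factor::SrcAlpha, Factor::InvSrcAlpha);
        printf("%.2f %.2f %.2f %.2f\n", out.r, out.g, out.b, out.a);   // 0.50 0.00 0.50 0.75
    }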
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file blend_jit.h -+* -+* @brief Definition of the blend jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "core/context.h" -+#include "core/state.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// State required for blend jit -+////////////////////////////////////////////////////////////////////////// -+struct BLEND_COMPILE_STATE -+{ -+ SWR_FORMAT format; // format of render target being blended -+ bool independentAlphaBlendEnable; -+ SWR_RENDER_TARGET_BLEND_STATE blendState; -+ -+ bool operator==(const BLEND_COMPILE_STATE& other) const -+ { -+ return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp -new file mode 100644 -index 0000000..b971791 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp -@@ -0,0 +1,56 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
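Because BLEND_COMPILE_STATE compares itself with memcmp, it is effectively a bit-pattern key: padding and unset fields must be zeroed, or two logically identical states can compare unequal. A small sketch of using it that way as a lookup key; the linear cache is hypothetical, and it assumes jit_api.h declares HANDLE, PFN_BLEND_JIT_FUNC, and JitCompileBlend as in the sources above:

    #include <utility>
    #include <vector>
    #include "jit_api.h"       // assumed: HANDLE, PFN_BLEND_JIT_FUNC, JitCompileBlend
    #include "blend_jit.h"     // BLEND_COMPILE_STATE

    // Hypothetical linear cache keyed on the memcmp-comparable compile state.
    static PFN_BLEND_JIT_FUNC GetBlendFunc(HANDLE hJit, const BLEND_COMPILE_STATE& state)
    {
        static std::vector<std::pair<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>> cache;

        for (auto& entry : cache)
            if (entry.first == state)      // operator== is memcmp over the whole struct
                return entry.second;

        PFN_BLEND_JIT_FUNC pfn = JitCompileBlend(hJit, state);
        cache.emplace_back(state, pfn);
        return pfn;
    }

    // Callers should value-initialize the key so padding bytes are zero:
    //     BLEND_COMPILE_STATE state = {};
    //     state.format = R32G32B32A32_FLOAT;
    //     ...
    //     PFN_BLEND_JIT_FUNC pfn = GetBlendFunc(hJit, state);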
-+* -+* @file builder.h -+* -+* @brief Includes all the builder related functionality -+* -+* Notes: -+* -+******************************************************************************/ -+ -+#include "builder.h" -+ -+using namespace llvm; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Contructor for Builder. -+/// @param pJitMgr - JitManager which contains modules, function passes, etc. -+Builder::Builder(JitManager *pJitMgr) -+ : mpJitMgr(pJitMgr) -+{ -+ mpIRBuilder = &pJitMgr->mBuilder; -+ -+ mFP16Ty = Type::getHalfTy(pJitMgr->mContext); -+ mFP32Ty = Type::getFloatTy(pJitMgr->mContext); -+ mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); -+ mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); -+ mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); -+ mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); -+ mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector(4, mFP32Ty), false); // vector4 float type (represented as structure) -+ mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector(4, mInt32Ty), false); // vector4 int type -+ mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth); -+ mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth); -+ mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth); -+ mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth); -+ mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h -new file mode 100644 -index 0000000..1342f28 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h -@@ -0,0 +1,66 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file builder.h -+* -+* @brief Includes all the builder related functionality -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "JitManager.h" -+#include "common/formats.h" -+ -+using namespace llvm; -+ -+struct Builder -+{ -+ Builder(JitManager *pJitMgr); -+ IRBuilder<>* IRB() { return mpIRBuilder; }; -+ JitManager* JM() { return mpJitMgr; } -+ -+ JitManager* mpJitMgr; -+ IRBuilder<>* mpIRBuilder; -+ -+ // Built in types. 
-+ Type* mInt8Ty; -+ Type* mInt16Ty; -+ Type* mInt32Ty; -+ Type* mInt64Ty; -+ Type* mFP16Ty; -+ Type* mFP32Ty; -+ Type* mSimdFP16Ty; -+ Type* mSimdFP32Ty; -+ Type* mSimdInt16Ty; -+ Type* mSimdInt32Ty; -+ Type* mSimdInt64Ty; -+ StructType* mV4FP32Ty; -+ StructType* mV4Int32Ty; -+ -+#include "builder_gen.h" -+#include "builder_x86.h" -+#include "builder_misc.h" -+#include "builder_math.h" -+ -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp -new file mode 100644 -index 0000000..7b5ef20 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp -@@ -0,0 +1,1052 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
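For orientation: with the AVX/AVX2 width chosen in JitManager (mVWidth == 8), mSimdFP32Ty is an 8-wide float vector, and the simdvector struct types are four such channels, i.e. eight 4-component values stored channel-major. A plain-C++ picture of that layout, assuming 8 lanes; the type names here are illustrative, not the rasterizer's own:

    #include <cstdio>

    constexpr int kSimdWidth = 8;            // mVWidth for AVX/AVX2 in JitManager

    // One SIMD register's worth of a single channel (<8 x float> in the IR).
    struct SimdFloat { float lane[kSimdWidth]; };

    // Mirrors mSimdVectorTy: a struct of four 8-wide channels (x, y, z, w).
    struct SimdVector { SimdFloat v[4]; };

    int main()
    {
        SimdVector pos = {};
        // Element 3's (x, w) live at v[0].lane[3] and v[3].lane[3]:
        pos.v[0].lane[3] = 1.0f;   // x of element 3
        pos.v[3].lane[3] = 1.0f;   // w of element 3
        printf("%f %f\n", pos.v[0].lane[3], pos.v[3].lane[3]);
    }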
-+* -+* @file builder_gen.cpp -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#include "builder.h" -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GLOBAL_STRING(StringRef Str, const Twine &Name) -+{ -+ return IRB()->CreateGlobalString(Str, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMCPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMCPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::LIFETIME_START(Value *Ptr, ConstantInt *Size) -+{ -+ return IRB()->CreateLifetimeStart(Ptr, Size); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::LIFETIME_END(Value *Ptr, ConstantInt *Size) -+{ -+ return IRB()->CreateLifetimeEnd(Ptr, Size); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru, const Twine &Name) -+{ -+ return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask) -+{ -+ return IRB()->CreateMaskedStore(Val, Ptr, Align, Mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::ASSUMPTION(Value *Cond) -+{ -+ 
return IRB()->CreateAssumption(Cond); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::GC_STATEPOINT(Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name) -+{ -+ return IRB()->CreateGCStatepoint(ActualCallee, CallArgs, DeoptArgs, GCArgs, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name) -+{ -+ return IRB()->CreateGCResult(Statepoint, ResultType, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name) -+{ -+ return IRB()->CreateGCRelocate(Statepoint, BaseOffset, DerivedOffset, ResultType, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+ReturnInst *Builder::RET_VOID() -+{ -+ return IRB()->CreateRetVoid(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+ReturnInst *Builder::RET(Value *V) -+{ -+ return IRB()->CreateRet(V); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+ReturnInst *Builder::AGGREGATE_RET(Value *const *retVals, unsigned N) -+{ -+ return IRB()->CreateAggregateRet(retVals, N); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+BranchInst *Builder::BR(BasicBlock *Dest) -+{ -+ return IRB()->CreateBr(Dest); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+BranchInst *Builder::COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights) -+{ -+ return IRB()->CreateCondBr(Cond, True, False, BranchWeights); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+SwitchInst *Builder::SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases, MDNode *BranchWeights) -+{ -+ return IRB()->CreateSwitch(V, Dest, NumCases, BranchWeights); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+IndirectBrInst *Builder::INDIRECT_BR(Value *Addr, unsigned NumDests) -+{ -+ return IRB()->CreateIndirectBr(Addr, NumDests); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, const Twine &Name) -+{ -+ return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, const Twine &Name) -+{ -+ return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Arg1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE3(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name) -+{ -+ return IRB()->CreateInvoke3(Callee, NormalDest, UnwindDest, Arg1, Arg2, Arg3, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args, const Twine &Name) -+{ -+ return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, Name); -+} -+ 
-+////////////////////////////////////////////////////////////////////////// -+ResumeInst *Builder::RESUME(Value *Exn) -+{ -+ return IRB()->CreateResume(Exn); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+UnreachableInst *Builder::UNREACHABLE() -+{ -+ return IRB()->CreateUnreachable(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ADD(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateAdd(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_ADD(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNSWAdd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_ADD(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNUWAdd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FADD(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFAdd(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SUB(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateSub(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_SUB(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNSWSub(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_SUB(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNUWSub(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FSUB(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFSub(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::MUL(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateMul(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_MUL(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNSWMul(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_MUL(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNUWMul(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FMUL(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFMul(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::UDIV(Value *LHS, Value *RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateUDiv(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateExactUDiv(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SDIV(Value *LHS, Value *RHS, const Twine &Name, bool isExact) 
-+{ -+ return IRB()->CreateSDiv(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateExactSDiv(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FDIV(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFDiv(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::UREM(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateURem(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SREM(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateSRem(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FREM(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFRem(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SHL(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SHL(Value *LHS, const APInt &RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SHL(Value *LHS, uint64_t RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::LSHR(Value *LHS, Value *RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateLShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::LSHR(Value *LHS, const APInt &RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateLShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::LSHR(Value *LHS, uint64_t RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateLShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ASHR(Value *LHS, Value *RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateAShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ASHR(Value *LHS, const APInt &RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateAShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ASHR(Value *LHS, uint64_t RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateAShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::AND(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateAnd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::AND(Value *LHS, const APInt &RHS, const Twine &Name) -+{ 
-+ return IRB()->CreateAnd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::AND(Value *LHS, uint64_t RHS, const Twine &Name) -+{ -+ return IRB()->CreateAnd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::OR(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateOr(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::OR(Value *LHS, const APInt &RHS, const Twine &Name) -+{ -+ return IRB()->CreateOr(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::OR(Value *LHS, uint64_t RHS, const Twine &Name) -+{ -+ return IRB()->CreateOr(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::XOR(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateXor(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::XOR(Value *LHS, const APInt &RHS, const Twine &Name) -+{ -+ return IRB()->CreateXor(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::XOR(Value *LHS, uint64_t RHS, const Twine &Name) -+{ -+ return IRB()->CreateXor(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateBinOp(Opc, LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NEG(Value *V, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateNeg(V, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_NEG(Value *V, const Twine &Name) -+{ -+ return IRB()->CreateNSWNeg(V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_NEG(Value *V, const Twine &Name) -+{ -+ return IRB()->CreateNUWNeg(V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FNEG(Value *V, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFNeg(V, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NOT(Value *V, const Twine &Name) -+{ -+ return IRB()->CreateNot(V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+AllocaInst *Builder::ALLOCA(Type *Ty, Value *ArraySize, const Twine &Name) -+{ -+ return IRB()->CreateAlloca(Ty, ArraySize, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::LOAD(Value *Ptr, const char *Name) -+{ -+ return IRB()->CreateLoad(Ptr, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::LOAD(Value *Ptr, const Twine &Name) -+{ -+ return IRB()->CreateLoad(Ptr, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name) -+{ -+ return IRB()->CreateLoad(Ptr, isVolatile, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// 
-+StoreInst *Builder::STORE(Value *Val, Value *Ptr, bool isVolatile) -+{ -+ return IRB()->CreateStore(Val, Ptr, isVolatile); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name) -+{ -+ return IRB()->CreateAlignedLoad(Ptr, Align, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name) -+{ -+ return IRB()->CreateAlignedLoad(Ptr, Align, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name) -+{ -+ return IRB()->CreateAlignedLoad(Ptr, Align, isVolatile, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+StoreInst *Builder::ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile) -+{ -+ return IRB()->CreateAlignedStore(Val, Ptr, Align, isVolatile); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+FenceInst *Builder::FENCE(AtomicOrdering Ordering, SynchronizationScope SynchScope, const Twine &Name) -+{ -+ return IRB()->CreateFence(Ordering, SynchScope, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+AtomicCmpXchgInst *Builder::ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) -+{ -+ return IRB()->CreateAtomicCmpXchg(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SynchScope); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+AtomicRMWInst *Builder::ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SynchronizationScope SynchScope) -+{ -+ return IRB()->CreateAtomicRMW(Op, Ptr, Val, Ordering, SynchScope); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GEPA(Value *Ptr, ArrayRef IdxList, const Twine &Name) -+{ -+ return IRB()->CreateGEP(Ptr, IdxList, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IN_BOUNDS_GEP(Value *Ptr, ArrayRef IdxList, const Twine &Name) -+{ -+ return IRB()->CreateInBoundsGEP(Ptr, IdxList, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GEP(Value *Ptr, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateGEP(Ptr, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IN_BOUNDS_GEP(Value *Ptr, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateInBoundsGEP(Ptr, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP1_32(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP1_32(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP2_32(Ptr, Idx0, Idx1, Name); -+} -+ 
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP2_32(Ptr, Idx0, Idx1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP1_64(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP1_64(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP2_64(Ptr, Idx0, Idx1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP2_64(Ptr, Idx0, Idx1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::STRUCT_GEP(Value *Ptr, unsigned Idx, const Twine &Name) -+{ -+ return IRB()->CreateStructGEP(Ptr, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GLOBAL_STRING_PTR(StringRef Str, const Twine &Name) -+{ -+ return IRB()->CreateGlobalStringPtr(Str, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::Z_EXT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateZExt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::S_EXT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSExt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateZExtOrTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSExtOrTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_TO_UI(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPToUI(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_TO_SI(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPToSI(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::UI_TO_FP(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateUIToFP(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SI_TO_FP(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSIToFP(V, DestTy, Name); -+} -+ 
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_EXT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPExt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreatePtrToInt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateIntToPtr(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BITCAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateAddrSpaceCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateZExtOrBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSExtOrBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateTruncOrBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateCast(Op, V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::POINTER_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreatePointerCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreatePointerBitCastOrAddrSpaceCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name) -+{ -+ return IRB()->CreateIntCast(V, DestTy, isSigned, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateBitOrPointerCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpEQ(LHS, RHS, Name); -+} -+ 
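The wrappers above cover the full IRBuilder cast family. As an illustration only (EmitSExt16ToFloat is hypothetical and not part of this patch), the widen-then-convert sequence used when unpacking narrow integer formats would be spelled like this with these wrappers:

// Hypothetical helper, for illustration only: sign extend a 16-bit integer
// value to 32 bits, then convert it to float.
Value *Builder::EmitSExt16ToFloat(Value *val16)
{
    Value *val32 = S_EXT(val16, mInt32Ty);   // i16 -> i32, preserving sign
    return SI_TO_FP(val32, mFP32Ty);         // i32 -> float
}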
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_NE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpNE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpUGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpUGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpULT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpULE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSLT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSLE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOEQ(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOLT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOLE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpONE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpORD(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUNO(LHS, RHS, Name); -+} -+ 
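The ordered float predicates above (FCMP_O*) evaluate to false whenever either operand is NaN, while FCMP_ORD and FCMP_UNO test orderedness itself; the unordered variants follow below. A hypothetical sketch (EmitCanonicalizeNaN is not part of this patch, and it relies on SELECT, which appears further down in this generated file) of the usual self-compare idiom built from these wrappers:

// Hypothetical helper, for illustration only: replace NaN lanes of 'v' with
// a supplied default. FCMP_UNO(v, v) is true exactly where v is NaN.
Value *Builder::EmitCanonicalizeNaN(Value *v, Value *vDefault)
{
    Value *isNaN = FCMP_UNO(v, v);
    return SELECT(isNaN, vDefault, v);
}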
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUEQ(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpULT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpULE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUNE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmp(P, LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmp(P, LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+PHINode *Builder::PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name) -+{ -+ return IRB()->CreatePHI(Ty, NumReservedValues, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL(Value *Callee, const Twine &Name) -+{ -+ return IRB()->CreateCall(Callee, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL(Value *Callee, Value *Arg, const Twine &Name) -+{ -+ return IRB()->CreateCall(Callee, Arg, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL2(Value *Callee, Value *Arg1, Value *Arg2, const Twine &Name) -+{ -+ return IRB()->CreateCall2(Callee, Arg1, Arg2, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL3(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name) -+{ -+ return IRB()->CreateCall3(Callee, Arg1, Arg2, Arg3, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL4(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, const Twine &Name) -+{ -+ return IRB()->CreateCall4(Callee, Arg1, Arg2, Arg3, Arg4, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL5(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, Value *Arg5, const Twine &Name) -+{ -+ return IRB()->CreateCall5(Callee, Arg1, Arg2, Arg3, Arg4, Arg5, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALLA(Value *Callee, ArrayRef Args, const Twine &Name) -+{ -+ return IRB()->CreateCall(Callee, Args, Name); 
-+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SELECT(Value *C, Value *True, Value *False, const Twine &Name) -+{ -+ return IRB()->CreateSelect(C, True, False, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+VAArgInst *Builder::VA_ARG(Value *List, Type *Ty, const Twine &Name) -+{ -+ return IRB()->CreateVAArg(List, Ty, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VEXTRACT(Value *Vec, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateExtractElement(Vec, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name) -+{ -+ return IRB()->CreateShuffleVector(V1, V2, Mask, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXTRACT_VALUE(Value *Agg, ArrayRef Idxs, const Twine &Name) -+{ -+ return IRB()->CreateExtractValue(Agg, Idxs, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::INSERT_VALUE(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name) -+{ -+ return IRB()->CreateInsertValue(Agg, Val, Idxs, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LandingPadInst *Builder::LANDING_PAD(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name) -+{ -+ return IRB()->CreateLandingPad(Ty, PersFn, NumClauses, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IS_NULL(Value *Arg, const Twine &Name) -+{ -+ return IRB()->CreateIsNull(Arg, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IS_NOT_NULL(Value *Arg, const Twine &Name) -+{ -+ return IRB()->CreateIsNotNull(Arg, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreatePtrDiff(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name) -+{ -+ return IRB()->CreateVectorSplat(NumElts, V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name) -+{ -+ return IRB()->CreateExtractInteger(DL, From, ExtractedTy, Offset, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue) -+{ -+ return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h -new file mode 100644 -index 0000000..c39077c ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h -@@ -0,0 +1,205 @@ 
-+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file builder_gen.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// Auto-generated Builder IR declarations -+////////////////////////////////////////////////////////////////////////// -+Value *GLOBAL_STRING(StringRef Str, const Twine &Name = ""); -+CallInst *MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMCPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMCPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *LIFETIME_START(Value *Ptr, ConstantInt *Size = nullptr); -+CallInst *LIFETIME_END(Value *Ptr, ConstantInt *Size = nullptr); -+CallInst *MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = 0, const Twine &Name = ""); -+CallInst *MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask); -+CallInst *ASSUMPTION(Value *Cond); -+CallInst *GC_STATEPOINT(Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = ""); -+CallInst *GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name = ""); -+CallInst *GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int 
DerivedOffset, Type *ResultType, const Twine &Name = ""); -+ReturnInst *RET_VOID(); -+ReturnInst *RET(Value *V); -+ReturnInst *AGGREGATE_RET(Value *const *retVals, unsigned N); -+BranchInst *BR(BasicBlock *Dest); -+BranchInst *COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights = nullptr); -+SwitchInst *SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases = 10, MDNode *BranchWeights = nullptr); -+IndirectBrInst *INDIRECT_BR(Value *Addr, unsigned NumDests = 10); -+InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, const Twine &Name = ""); -+InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, const Twine &Name = ""); -+InvokeInst *INVOKE3(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name = ""); -+InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args, const Twine &Name = ""); -+ResumeInst *RESUME(Value *Exn); -+UnreachableInst *UNREACHABLE(); -+Value *ADD(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_ADD(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *NUW_ADD(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FADD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *SUB(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_SUB(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *NUW_SUB(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FSUB(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *MUL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_MUL(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *NUW_MUL(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FMUL(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *UDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *SDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FDIV(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *UREM(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *SREM(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FREM(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *SHL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *SHL(Value *LHS, const APInt &RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *SHL(Value *LHS, uint64_t RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *LSHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *LSHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false); -+Value *LSHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false); -+Value *ASHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *ASHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false); -+Value *ASHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false); -+Value *AND(Value *LHS, Value *RHS, 
const Twine &Name = ""); -+Value *AND(Value *LHS, const APInt &RHS, const Twine &Name = ""); -+Value *AND(Value *LHS, uint64_t RHS, const Twine &Name = ""); -+Value *OR(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *OR(Value *LHS, const APInt &RHS, const Twine &Name = ""); -+Value *OR(Value *LHS, uint64_t RHS, const Twine &Name = ""); -+Value *XOR(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *XOR(Value *LHS, const APInt &RHS, const Twine &Name = ""); -+Value *XOR(Value *LHS, uint64_t RHS, const Twine &Name = ""); -+Value *BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *NEG(Value *V, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_NEG(Value *V, const Twine &Name = ""); -+Value *NUW_NEG(Value *V, const Twine &Name = ""); -+Value *FNEG(Value *V, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *NOT(Value *V, const Twine &Name = ""); -+AllocaInst *ALLOCA(Type *Ty, Value *ArraySize = nullptr, const Twine &Name = ""); -+LoadInst *LOAD(Value *Ptr, const char *Name); -+LoadInst *LOAD(Value *Ptr, const Twine &Name = ""); -+LoadInst *LOAD(Value *Ptr, bool isVolatile, const Twine &Name = ""); -+StoreInst *STORE(Value *Val, Value *Ptr, bool isVolatile = false); -+LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name); -+LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name = ""); -+LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name = ""); -+StoreInst *ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false); -+FenceInst *FENCE(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread, const Twine &Name = ""); -+AtomicCmpXchgInst *ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope = CrossThread); -+AtomicRMWInst *ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread); -+Value *GEPA(Value *Ptr, ArrayRef IdxList, const Twine &Name = ""); -+Value *IN_BOUNDS_GEP(Value *Ptr, ArrayRef IdxList, const Twine &Name = ""); -+Value *GEP(Value *Ptr, Value *Idx, const Twine &Name = ""); -+Value *IN_BOUNDS_GEP(Value *Ptr, Value *Idx, const Twine &Name = ""); -+Value *CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = ""); -+Value *CONST_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = ""); -+Value *CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = ""); -+Value *CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = ""); -+Value *STRUCT_GEP(Value *Ptr, unsigned Idx, const Twine &Name = ""); -+Value *GLOBAL_STRING_PTR(StringRef Str, const Twine &Name = ""); -+Value *TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *Z_EXT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *S_EXT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); 
-+Value *FP_TO_UI(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_TO_SI(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *UI_TO_FP(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *SI_TO_FP(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_EXT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *BITCAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name = ""); -+Value *POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name = ""); -+Value *BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_NE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = ""); -+PHINode *PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name = ""); -+CallInst *CALL(Value *Callee, const Twine &Name = ""); -+CallInst *CALL(Value *Callee, Value *Arg, const Twine &Name = ""); -+CallInst *CALL2(Value *Callee, Value *Arg1, Value *Arg2, const Twine &Name = ""); -+CallInst *CALL3(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name = ""); 
-+CallInst *CALL4(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, const Twine &Name = ""); -+CallInst *CALL5(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, Value *Arg5, const Twine &Name = ""); -+CallInst *CALLA(Value *Callee, ArrayRef Args, const Twine &Name = ""); -+Value *SELECT(Value *C, Value *True, Value *False, const Twine &Name = ""); -+VAArgInst *VA_ARG(Value *List, Type *Ty, const Twine &Name = ""); -+Value *VEXTRACT(Value *Vec, Value *Idx, const Twine &Name = ""); -+Value *VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name = ""); -+Value *VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name = ""); -+Value *EXTRACT_VALUE(Value *Agg, ArrayRef Idxs, const Twine &Name = ""); -+Value *INSERT_VALUE(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name = ""); -+LandingPadInst *LANDING_PAD(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name = ""); -+Value *IS_NULL(Value *Arg, const Twine &Name = ""); -+Value *IS_NOT_NULL(Value *Arg, const Twine &Name = ""); -+Value *PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name = ""); -+Value *EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name); -+CallInst *ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue = nullptr); -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h -new file mode 100644 -index 0000000..92867ec ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h -@@ -0,0 +1,34 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file builder_math.h -+* -+* @brief math/alu builder functions -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+Value* VLOG2PS(Value* src); -+Value* VPOW24PS(Value* src); -+Value* VEXP2PS(Value* src); -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp -new file mode 100644 -index 0000000..5897121 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp -@@ -0,0 +1,1195 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file builder_misc.cpp -+* -+* @brief Implementation for miscellaneous builder functions -+* -+* Notes: -+* -+******************************************************************************/ -+#include "builder.h" -+#include "llvm/Support/DynamicLibrary.h" -+ -+void __cdecl CallPrint(const char* fmt, ...); -+ -+Constant *Builder::C(bool i) -+{ -+ return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); -+} -+ -+Constant *Builder::C(char i) -+{ -+ return ConstantInt::get(IRB()->getInt8Ty(), i); -+} -+ -+Constant *Builder::C(uint8_t i) -+{ -+ return ConstantInt::get(IRB()->getInt8Ty(), i); -+} -+ -+Constant *Builder::C(int i) -+{ -+ return ConstantInt::get(IRB()->getInt32Ty(), i); -+} -+ -+Constant *Builder::C(int64_t i) -+{ -+ return ConstantInt::get(IRB()->getInt64Ty(), i); -+} -+ -+Constant *Builder::C(UINT16 i) -+{ -+ return ConstantInt::get(mInt16Ty,i); -+} -+ -+Constant *Builder::C(uint32_t i) -+{ -+ return ConstantInt::get(IRB()->getInt32Ty(), i); -+} -+ -+Constant *Builder::C(float i) -+{ -+ return ConstantFP::get(IRB()->getFloatTy(), i); -+} -+ -+Constant *Builder::PRED(bool pred) -+{ -+ return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 
1 : 0)); -+} -+ -+Value *Builder::VIMMED1(int i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VIMMED1(uint32_t i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VIMMED1(float i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VIMMED1(bool i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VUNDEF_IPTR() -+{ -+ return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); -+} -+ -+Value *Builder::VUNDEF_I() -+{ -+ return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); -+} -+ -+Value *Builder::VUNDEF(Type *ty, uint32_t size) -+{ -+ return UndefValue::get(VectorType::get(ty, size)); -+} -+ -+Value *Builder::VUNDEF_F() -+{ -+ return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); -+} -+ -+Value *Builder::VUNDEF(Type* t) -+{ -+ return UndefValue::get(VectorType::get(t, JM()->mVWidth)); -+} -+ -+Value *Builder::VINSERT(Value *vec, Value *val, int index) -+{ -+ return VINSERT(vec, val, C(index)); -+} -+ -+Value *Builder::VBROADCAST(Value *src) -+{ -+ // check if src is already a vector -+ if (src->getType()->isVectorTy()) -+ { -+ return src; -+ } -+ -+ Value *vecRet = VUNDEF(src->getType()); -+ vecRet = VINSERT(vecRet, src, 0); -+ vecRet = VSHUFFLE(vecRet, vecRet, VIMMED1(0)); -+ -+ return vecRet; -+} -+ -+uint32_t Builder::IMMED(Value* v) -+{ -+ SWR_ASSERT(isa(v)); -+ ConstantInt *pValConst = cast(v); -+ return pValConst->getZExtValue(); -+} -+ -+Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) -+{ -+ std::vector indices; -+ for (auto i : indexList) -+ indices.push_back(i); -+ return GEPA(ptr, indices); -+} -+ -+Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) -+{ -+ std::vector indices; -+ for (auto i : indexList) -+ indices.push_back(C(i)); -+ return GEPA(ptr, indices); -+} -+ -+LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(C(i)); -+ return LOAD(GEPA(basePtr, valIndices), name); -+} -+ -+LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(i); -+ return LOAD(GEPA(basePtr, valIndices), name); -+} -+ -+StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list &indices) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(C(i)); -+ return STORE(val, GEPA(basePtr, valIndices)); -+} -+ -+StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list &indices) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(i); -+ return STORE(val, GEPA(basePtr, valIndices)); -+} -+ -+CallInst *Builder::CALL(Value *Callee, const std::initializer_list &argsList) -+{ -+ std::vector args; -+ for (auto arg : argsList) -+ args.push_back(arg); -+ return CALLA(Callee, args); -+} -+ -+Value *Builder::VRCP(Value *va) -+{ -+ return FDIV(VIMMED1(1.0f), va); // 1 / a -+} -+ -+Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY) -+{ -+ Value* vOut = FMADDPS(vA, vX, vC); -+ vOut = FMADDPS(vB, vY, vOut); -+ return vOut; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate an i32 masked load operation in LLVM IR. 
If not -+/// supported on the underlying platform, emulate it with float masked load -+/// @param src - base address pointer for the load -+/// @param vMask - SIMD wide mask that controls whether to access memory or load 0 -+Value *Builder::MASKLOADD(Value* src,Value* mask) -+{ -+ Value* vResult; -+ // use avx2 maskload instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); -+ vResult = CALL2(func,src,mask); -+ } -+ else -+ { -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); -+ Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth)); -+ vResult = BITCAST(CALL2(func,src,fMask), VectorType::get(mInt32Ty,JM()->mVWidth)); -+ } -+ return vResult; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief insert a JIT call to CallPrint -+/// - outputs formatted string to both stdout and VS output window -+/// - DEBUG builds only -+/// Usage example: -+/// PRINT("index %d = 0x%p\n",{C(lane), pIndex}); -+/// where C(lane) creates a constant value to print, and pIndex is the Value* -+/// result from a GEP, printing out the pointer to memory -+/// @param printStr - constant string to print, which includes format specifiers -+/// @param printArgs - initializer list of Value*'s to print to stdout -+CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs) -+{ -+#if defined( DEBUG ) || defined( _DEBUG ) -+ // push the arguments to CallPrint into a vector -+ std::vector<Value*> printCallArgs; -+ // save room for the format string. we still need to modify it for vectors -+ printCallArgs.resize(1); -+ -+ // search through the format string for special processing -+ size_t pos = 0; -+ std::string tempStr(printStr); -+ pos = tempStr.find('%', pos); -+ auto v = printArgs.begin(); -+ // printing is slow. now it's slower...
-+ while((pos != std::string::npos) && (v != printArgs.end())) -+ { -+ // for %f we need to cast float Values to doubles so that they print out correctly -+ if((tempStr[pos+1]=='f') && ((*v)->getType()->isFloatTy())) -+ { -+ printCallArgs.push_back(FP_EXT(*v, Type::getDoubleTy(JM()->mContext))); -+ pos++; -+ } -+ // add special handling for %f and %d format specifiers to make printing llvm vector types easier -+ else if((*v)->getType()->isVectorTy()) -+ { -+ if((tempStr[pos+1]=='f') && ((*v)->getType()->getContainedType(0)->isFloatTy())) -+ { -+ uint32_t i = 0; -+ for( ; i < ((*v)->getType()->getVectorNumElements())-1; i++) -+ { -+ tempStr.insert(pos, std::string("%f ")); -+ pos+=3; -+ printCallArgs.push_back(FP_EXT(VEXTRACT(*v, C(i)), Type::getDoubleTy(JM()->mContext))); -+ } -+ printCallArgs.push_back(FP_EXT(VEXTRACT(*v,C(i)),Type::getDoubleTy(JM()->mContext))); -+ } -+ else if((tempStr[pos+1]=='d') && ((*v)->getType()->getContainedType(0)->isIntegerTy())) -+ { -+ uint32_t i = 0; -+ for( ; i < ((*v)->getType()->getVectorNumElements())-1; i++) -+ { -+ tempStr.insert(pos,std::string("%d ")); -+ pos += 3; -+ printCallArgs.push_back(VEXTRACT(*v,C(i))); -+ } -+ printCallArgs.push_back(VEXTRACT(*v,C(i))); -+ } -+ else -+ { -+ /// not a supported vector to print -+ /// @todo pointer types too -+ SWR_ASSERT(0); -+ } -+ } -+ else -+ { -+ printCallArgs.push_back(*v); -+ } -+ -+ // advance to the next argument -+ v++; -+ pos = tempStr.find('%', ++pos); -+ } -+ -+ // create global variable constant string -+ Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); -+ GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); -+ JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); -+ -+ // get a pointer to the first character in the constant string array -+ std::vector<Constant*> geplist{C(0),C(0)}; -+ Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); -+ -+ // insert the pointer to the format string in the argument vector -+ printCallArgs[0] = strGEP; -+ -+ // get pointer to CallPrint function and insert decl into the module if needed -+ std::vector<Type*> args; -+ args.push_back(PointerType::get(mInt8Ty,0)); -+ FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); -+ Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); -+ -+ // if we haven't yet added the symbol to the symbol table -+ if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) -+ { -+ sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); -+ } -+ -+ // insert a call to CallPrint -+ return CALLA(callPrintFn,printCallArgs); -+#else // #if defined( DEBUG ) || defined( _DEBUG ) -+ return nullptr; -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a masked gather operation in LLVM IR.
If not -+/// supported on the underlying platform, emulate it with loads -+/// @param vSrc - SIMD wide value that will be loaded if mask is invalid -+/// @param pBase - Int8* base VB address pointer value -+/// @param vIndices - SIMD wide value of VB byte offsets -+/// @param vMask - SIMD wide mask that controls whether to access memory or the src values -+/// @param scale - value to scale indices by -+Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) -+{ -+ Value* vGather; -+ -+ // use avx2 gather instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ // force mask to , required by vgather -+ vMask = BITCAST(vMask, mSimdFP32Ty); -+ vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); -+ } -+ else -+ { -+ Value* pStack = STACKSAVE(); -+ -+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address -+ Value* vSrcPtr = ALLOCA(vSrc->getType()); -+ STORE(vSrc, vSrcPtr); -+ -+ vGather = VUNDEF_F(); -+ Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); -+ Value *vOffsets = MUL(vIndices,vScaleVec); -+ Value *mask = MASK(vMask); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ // single component byte index -+ Value *offset = VEXTRACT(vOffsets,C(i)); -+ // byte pointer to component -+ Value *loadAddress = GEP(pBase,offset); -+ loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); -+ // pointer to the value to load if we're masking off a component -+ Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); -+ Value *selMask = VEXTRACT(mask,C(i)); -+ // switch in a safe address to load if we're trying to access a vertex -+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); -+ Value *val = LOAD(validAddress); -+ vGather = VINSERT(vGather,val,C(i)); -+ } -+ STACKRESTORE(pStack); -+ } -+ -+ return vGather; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a masked gather operation in LLVM IR. If not -+/// supported on the underlying platform, emulate it with loads -+/// @param vSrc - SIMD wide value that will be loaded if mask is invalid -+/// @param pBase - Int8* base VB address pointer value -+/// @param vIndices - SIMD wide value of VB byte offsets -+/// @param vMask - SIMD wide mask that controls whether to access memory or the src values -+/// @param scale - value to scale indices by -+Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) -+{ -+ Value* vGather; -+ -+ // use avx2 gather instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); -+ } -+ else -+ { -+ Value* pStack = STACKSAVE(); -+ -+ // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address -+ Value* vSrcPtr = ALLOCA(vSrc->getType()); -+ STORE(vSrc, vSrcPtr); -+ -+ vGather = VUNDEF_I(); -+ Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); -+ Value *vOffsets = MUL(vIndices, vScaleVec); -+ Value *mask = MASK(vMask); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ // single component byte index -+ Value *offset = VEXTRACT(vOffsets, C(i)); -+ // byte pointer to component -+ Value *loadAddress = GEP(pBase, offset); -+ loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); -+ // pointer to the value to load if we're masking off a component -+ Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); -+ Value *selMask = VEXTRACT(mask, C(i)); -+ // switch in a safe address to load if we're trying to access a vertex -+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); -+ Value *val = LOAD(validAddress, C(0)); -+ vGather = VINSERT(vGather, val, C(i)); -+ } -+ -+ STACKRESTORE(pStack); -+ } -+ return vGather; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief convert x86 mask to llvm mask -+Value* Builder::MASK(Value* vmask) -+{ -+ Value* src = BITCAST(vmask, mSimdInt32Ty); -+ return ICMP_SLT(src, VIMMED1(0)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief convert llvm mask to x86 mask -+Value* Builder::VMASK(Value* mask) -+{ -+ return S_EXT(mask, mSimdInt32Ty); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPSHUFB operation in LLVM IR. If not -+/// supported on the underlying platform, emulate it -+/// @param a - 256bit SIMD(32x8bit) of 8bit integer values -+/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values -+/// Byte masks in lower 128 lane of b selects 8 bit values from lower -+/// 128bits of a, and vice versa for the upper lanes. If the mask -+/// value is negative, '0' is inserted. -+Value *Builder::PSHUFB(Value* a, Value* b) -+{ -+ Value* res; -+ // use avx2 pshufb instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ res = VPSHUFB(a, b); -+ } -+ else -+ { -+ Constant* cB = dyn_cast(b); -+ // number of 8 bit elements in b -+ uint32_t numElms = cast(cB->getType())->getNumElements(); -+ // output vector -+ Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); -+ -+ // insert an 8 bit value from the high and low lanes of a per loop iteration -+ numElms /= 2; -+ for(uint32_t i = 0; i < numElms; i++) -+ { -+ ConstantInt* cLow128b = cast(cB->getAggregateElement(i)); -+ ConstantInt* cHigh128b = cast(cB->getAggregateElement(i + numElms)); -+ -+ // extract values from constant mask -+ char valLow128bLane = (char)(cLow128b->getSExtValue()); -+ char valHigh128bLane = (char)(cHigh128b->getSExtValue()); -+ -+ Value* insertValLow128b; -+ Value* insertValHigh128b; -+ -+ // if the mask value is negative, insert a '0' in the respective output position -+ // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector -+ insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); -+ insertValHigh128b = (valHigh128bLane < 0) ? 
C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); -+ -+ vShuf = VINSERT(vShuf, insertValLow128b, i); -+ vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); -+ } -+ res = vShuf; -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32 -+/// bits) in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only -+/// lower 8 values are used. -+Value *Builder::PMOVSXBD(Value* a) -+{ -+ Value* res; -+ // use avx2 byte sign extend instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ res = VPMOVSXBD(a); -+ } -+ else -+ { -+ // VPMOVSXBD output type -+ Type* v8x32Ty = VectorType::get(mInt32Ty, 8); -+ // Extract 8 values from 128bit lane and sign extend -+ res = S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32 -+/// bits) in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. -+Value *Builder::PMOVSXWD(Value* a) -+{ -+ Value* res; -+ // use avx2 word sign extend if available -+ if(JM()->mArch.AVX2()) -+ { -+ res = VPMOVSXWD(a); -+ } -+ else -+ { -+ // VPMOVSXWD output type -+ Type* v8x32Ty = VectorType::get(mInt32Ty, 8); -+ // Extract 8 values from 128bit lane and sign extend -+ res = S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPERMD operation (shuffle 32 bit integer values -+/// across 128 bit lanes) in LLVM IR. If not supported on the underlying -+/// platform, emulate it -+/// @param a - 256bit SIMD lane(8x32bit) of integer values. -+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values -+Value *Builder::PERMD(Value* a, Value* idx) -+{ -+ Value* res; -+ // use avx2 permute instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ // llvm 3.6.0 swapped the order of the args to vpermd -+ res = VPERMD(idx, a); -+ } -+ else -+ { -+ res = VSHUFFLE(a, a, idx); -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) -+/// in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-+Value *Builder::CVTPH2PS(Value* a) -+{ -+ if (JM()->mArch.F16C()) -+ { -+ return VCVTPH2PS(a); -+ } -+ else -+ { -+ Value* vExt = S_EXT(a, mSimdInt32Ty); -+ Value* sign = AND(vExt,0x80000000); -+ -+ // normal case -+ Value* mantissa = SHL(AND(vExt, 0x03ff), 13); -+ Value* exponent = AND(vExt, 0x7c00); -+ exponent = ADD(exponent, VIMMED1(0x1c000)); -+ exponent = SHL(exponent, 13); -+ -+ Value* result = OR(OR(sign, mantissa), exponent); -+ -+ // handle 0 -+ Value* zeroMask = ICMP_EQ(AND(vExt, 0x7fff), VIMMED1(0)); -+ result = SELECT(zeroMask, sign, result); -+ -+ // handle infinity -+ Value* infMask = ICMP_EQ(AND(vExt, 0x7c00), VIMMED1(0x7c00)); -+ Value* signedInf = OR(VIMMED1(0x7f800000), sign); -+ result = SELECT(infMask, signedInf, result); -+ -+ // @todo handle subnormal -+ -+ // cast to f32 -+ result = BITCAST(result, mSimdFP32Ty); -+ return result; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) -+/// in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 256bit SIMD lane(8x32bit) of float32 values. -+Value *Builder::CVTPS2PH(Value* a, Value* rounding) -+{ -+ if (JM()->mArch.F16C()) -+ { -+ return VCVTPS2PH(a, rounding); -+ } -+ else -+ { -+ SWR_ASSERT(false, "Emulation of VCVTPS2PH unimplemented."); -+ return nullptr; -+ } -+} -+ -+Value *Builder::PMAXSD(Value* a, Value* b) -+{ -+ if (JM()->mArch.AVX2()) -+ { -+ return VPMAXSD(a, b); -+ } -+ else -+ { -+ // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources -+ Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); -+ -+ // low 128 -+ Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); -+ Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); -+ Value* resLo = CALL2(pmaxsd, aLo, bLo); -+ -+ // high 128 -+ Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); -+ Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); -+ Value* resHi = CALL2(pmaxsd, aHi, bHi); -+ -+ // combine -+ Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); -+ result = VINSERTI128(result, resHi, C((uint8_t)1)); -+ -+ return result; -+ } -+} -+ -+Value *Builder::PMINSD(Value* a, Value* b) -+{ -+ if (JM()->mArch.AVX2()) -+ { -+ return VPMINSD(a, b); -+ } -+ else -+ { -+ // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources -+ Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); -+ -+ // low 128 -+ Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); -+ Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); -+ Value* resLo = CALL2(pminsd, aLo, bLo); -+ -+ // high 128 -+ Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); -+ Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); -+ Value* resHi = CALL2(pminsd, aHi, bHi); -+ -+ // combine -+ Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); -+ result = VINSERTI128(result, resHi, C((uint8_t)1)); -+ -+ return result; -+ } -+} -+ -+void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, -+ Value* mask, Value* vGatherComponents[], bool bPackedOutput) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(format); -+ if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) -+ { -+ // ensure our mask is the correct type -+ mask = BITCAST(mask, mSimdFP32Ty); -+ GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); -+ } -+ else -+ { -+ // ensure our mask is the correct type -+ mask = BITCAST(mask, mSimdInt32Ty); -+ GATHER4DD(info, pSrcBase,
byteOffsets, mask, vGatherComponents, bPackedOutput); -+ } -+} -+ -+void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, -+ Value* mask, Value* vGatherComponents[], bool bPackedOutput) -+{ -+ switch(info.bpp / info.numComps) -+ { -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // TODO: vGatherMaskedVal -+ Value* vGatherMaskedVal = VIMMED1((float)0); -+ -+ // always have at least one component out of x or y to fetch -+ -+ // save mask as it is zero'd out after each gather -+ vMask = mask; -+ -+ vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ -+ // if we have at least one component out of x or y to fetch -+ if(info.numComps > 2) -+ { -+ // offset base to the next components(zw) in the vertex to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ vMask = mask; -+ -+ vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ else -+ { -+ vGatherResult[1] = vGatherMaskedVal; -+ } -+ -+ // Shuffle gathered components into place, each row is a component -+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); -+ } -+ break; -+ case 32: -+ { -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = mask; -+ -+ // Gather a SIMD of components -+ vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); -+ -+ // offset base to the next component to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ } -+ } -+ break; -+ default: -+ SWR_ASSERT(0, "Invalid float format"); -+ break; -+ } -+} -+ -+void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, -+ Value* mask, Value* vGatherComponents[], bool bPackedOutput) -+{ -+ switch (info.bpp / info.numComps) -+ { -+ case 8: -+ { -+ Value* vGatherMaskedVal = VIMMED1((int32_t)0); -+ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); -+ // e.g. result of an 8x32bit integer gather for 8bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw -+ -+ Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); -+ } -+ break; -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // TODO: vGatherMaskedVal -+ Value* vGatherMaskedVal = VIMMED1((int32_t)0); -+ -+ // always have at least one component out of x or y to fetch -+ -+ // save mask as it is zero'd out after each gather -+ vMask = mask; -+ -+ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. 
result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ -+ // if we have at least one component out of x or y to fetch -+ if(info.numComps > 2) -+ { -+ // offset base to the next components(zw) in the vertex to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ vMask = mask; -+ -+ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ else -+ { -+ vGatherResult[1] = vGatherMaskedVal; -+ } -+ -+ // Shuffle gathered components into place, each row is a component -+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); -+ -+ } -+ break; -+ case 32: -+ { -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherComponents[i] = VIMMED1((int)info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = mask; -+ -+ // Gather a SIMD of components -+ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); -+ -+ // offset base to the next component to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ } -+ } -+ break; -+ default: -+ SWR_ASSERT(0, "unsupported format"); -+ break; -+ } -+} -+ -+void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) -+{ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits -+ -+ // input could either be float or int vector; do shuffle work in int -+ vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); -+ vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); -+ -+ if(bPackedOutput) -+ { -+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ -+ // shuffle mask -+ Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, -+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy -+ -+ Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ // after PERMD: move and pack xy components into each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(info.numComps > 2) -+ { -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); -+ vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ } -+ -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ // todo: fixed for packed -+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); -+ if(i >= info.numComps) -+ { -+ // set the default component val -+ vGatherOutput[swizzleIndex] = vGatherMaskedVal; -+ continue; -+ } -+ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t 
lane = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; -+ -+ // extract packed component 128 bit lanes -+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); -+ } -+ -+ } -+ else -+ { -+ // pshufb masks for each component -+ Value* vConstMask[2]; -+ // x/z shuffle mask -+ vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, -+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); -+ -+ // y/w shuffle mask -+ vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, -+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); -+ -+ -+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // select correct constMask for x/z or y/w pshufb -+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ uint32_t selectedGather = (i < 2) ? 0 : 1; -+ -+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); -+ // after pshufb mask for x channel; z uses the same shuffle from the second gather -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 -+ } -+ } -+} -+ -+void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) -+{ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits -+ -+ if(bPackedOutput) -+ { -+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ // shuffle mask -+ Value* vConstMask = C({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, -+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww -+ -+ Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); -+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(info.numComps > 2) -+ { -+ vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); -+ } -+ -+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ // todo: fix for packed -+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); -+ if(i >= info.numComps) -+ { -+ // set the default component val -+ vGatherOutput[swizzleIndex] = vGatherMaskedVal; -+ continue; -+ } -+ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t lane = ((i == 0) || (i == 2)) ? 
0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; -+ -+ // sign extend -+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); -+ } -+ } -+ // else zero extend -+ else{ -+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++){ -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // pshufb masks for each component -+ Value* vConstMask; -+ switch(i) -+ { -+ case 0: -+ // x shuffle mask -+ vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, -+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); -+ break; -+ case 1: -+ // y shuffle mask -+ vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, -+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); -+ break; -+ case 2: -+ // z shuffle mask -+ vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, -+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); -+ break; -+ case 3: -+ // w shuffle mask -+ vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, -+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); -+ break; -+ default: -+ vConstMask = nullptr; -+ break; -+ } -+ -+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb for x channel -+ // 256i - 0 1 2 3 4 5 6 7 -+ // x000 x000 x000 x000 x000 x000 x000 x000 -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief emulates a scatter operation. 
-+/// @param pDst - pointer to destination -+/// @param vSrc - vector of src data to scatter -+/// @param vOffsets - vector of byte offsets from pDst -+/// @param vMask - mask of valid lanes -+void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) -+{ -+ Value* pStack = STACKSAVE(); -+ -+ // allocate tmp stack for masked off lanes -+ Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); -+ -+ Value *mask = MASK(vMask); -+ for (uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ Value *offset = VEXTRACT(vOffsets, C(i)); -+ // byte pointer to component -+ Value *storeAddress = GEP(pDst, offset); -+ storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); -+ Value *selMask = VEXTRACT(mask, C(i)); -+ Value *srcElem = VEXTRACT(vSrc, C(i)); -+ // switch in a safe address to load if we're trying to access a vertex -+ Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr); -+ STORE(srcElem, validAddress); -+ } -+ -+ STACKRESTORE(pStack); -+} -+ -+Value* Builder::VABSPS(Value* a) -+{ -+ Value* asInt = BITCAST(a, mSimdInt32Ty); -+ Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); -+ return result; -+} -+ -+Value *Builder::ICLAMP(Value* src, Value* low, Value* high) -+{ -+ Value *lowCmp = ICMP_SLT(src, low); -+ Value *ret = SELECT(lowCmp, low, src); -+ -+ Value *highCmp = ICMP_SGT(ret, high); -+ ret = SELECT(highCmp, high, ret); -+ -+ return ret; -+} -+ -+Value *Builder::FCLAMP(Value* src, Value* low, Value* high) -+{ -+ Value *lowCmp = FCMP_OLT(src, low); -+ Value *ret = SELECT(lowCmp, low, src); -+ -+ Value *highCmp = FCMP_OGT(ret, high); -+ ret = SELECT(highCmp, high, ret); -+ -+ return ret; -+} -+ -+Value *Builder::FCLAMP(Value* src, float low, float high) -+{ -+ Value* result = VMAXPS(src, VIMMED1(low)); -+ result = VMINPS(result, VIMMED1(high)); -+ -+ return result; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief save/restore stack, providing ability to push/pop the stack and -+/// reduce overall stack requirements for temporary stack use -+Value* Builder::STACKSAVE() -+{ -+ Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -+ return CALL(pfnStackSave); -+} -+ -+void Builder::STACKRESTORE(Value* pSaved) -+{ -+ Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); -+ CALL(pfnStackRestore, pSaved); -+} -+ -+Value *Builder::FMADDPS(Value* a, Value* b, Value* c) -+{ -+ Value* vOut; -+ // use FMADs if available -+ if(JM()->mArch.AVX2()) -+ { -+ vOut = VFMADDPS(a, b, c); -+ } -+ else -+ { -+ vOut = FADD(FMUL(a, b), c); -+ } -+ return vOut; -+} -+ -+Value* Builder::POPCNT(Value* a) -+{ -+ Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); -+ return CALL(pCtPop, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief C functions called by LLVM IR -+////////////////////////////////////////////////////////////////////////// -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief called in JIT code, inserted by PRINT -+/// output to both stdout and visual studio debug console -+void __cdecl CallPrint(const char* fmt, ...) 
-+{ -+#if defined( DEBUG ) || defined( _DEBUG ) -+ va_list args; -+ va_start(args, fmt); -+ vprintf(fmt, args); -+ -+#if defined( _WIN32 ) -+ char strBuf[1024]; -+ vsnprintf_s(strBuf, _TRUNCATE, fmt, args); -+ OutputDebugString(strBuf); -+#endif -+#endif // #if defined( DEBUG ) || defined( _DEBUG ) -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h -new file mode 100644 -index 0000000..8a32c6a ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+*
-+* @file builder_misc.h
-+*
-+* @brief miscellaneous builder functions
-+*
-+* Notes:
-+*
-+******************************************************************************/
-+#pragma once
-+
-+Constant *C(bool i);
-+Constant *C(char i);
-+Constant *C(uint8_t i);
-+Constant *C(int i);
-+Constant *C(int64_t i);
-+Constant *C(UINT16 i);
-+Constant *C(uint32_t i);
-+Constant *C(float i);
-+
-+template<typename Ty>
-+Constant *C(const std::initializer_list<Ty> &constList)
-+{
-+    std::vector<Constant*> vConsts;
-+    for(auto i : constList) {
-+
-+        vConsts.push_back(C((Ty)i));
-+    }
-+    return ConstantVector::get(vConsts);
-+}
-+
-+Constant *PRED(bool pred);
-+Value *VIMMED1(int i);
-+Value *VIMMED1(uint32_t i);
-+Value *VIMMED1(float i);
-+Value *VIMMED1(bool i);
-+Value *VUNDEF(Type* t);
-+Value *VUNDEF_F();
-+Value *VUNDEF_I();
-+Value *VUNDEF(Type* ty, uint32_t size);
-+Value *VUNDEF_IPTR();
-+Value *VINSERT(Value *vec, Value *val, int index);
-+Value *VBROADCAST(Value *src);
-+Value *VRCP(Value *va);
-+Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
-+
-+uint32_t IMMED(Value* i);
-+
-+Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-+Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
-+CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args);
-+
-+LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
-+LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
-+StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
-+StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
-+
-+Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); }
-+Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); }
-+Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); }
-+Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_UNORD_Q)); }
-+Value *VCMPPS_NEQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_NEQ_OQ)); }
-+Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE_OQ)); }
-+Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); }
-+Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); }
-+
-+Value *MASK(Value* vmask);
-+Value *VMASK(Value* mask);
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// @brief functions that build IR to call x86 intrinsics directly, or
-+/// emulate them with other instructions if not available on the host
-+//////////////////////////////////////////////////////////////////////////
-+Value *MASKLOADD(Value* src, Value* mask);
-+
-+void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
-+             Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-+
-+Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
-+void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-+               Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-+
-+Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
-+void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-+               Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-+
-+void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
-+
-+void Shuffle8bpcGather4(const SWR_FORMAT_INFO
&info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); -+void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); -+ -+Value *PSHUFB(Value* a, Value* b); -+Value *PMOVSXBD(Value* a); -+Value *PMOVSXWD(Value* a); -+Value *PERMD(Value* a, Value* idx); -+Value *CVTPH2PS(Value* a); -+Value *CVTPS2PH(Value* a, Value* rounding); -+Value *PMAXSD(Value* a, Value* b); -+Value *PMINSD(Value* a, Value* b); -+Value *VABSPS(Value* a); -+Value *FMADDPS(Value* a, Value* b, Value* c); -+ -+// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior -+Value *VPCMPGTD(Value* a, Value* b) -+{ -+ Value* vIndexMask = ICMP_UGT(a,b); -+ -+ // need to set the high bit for x86 intrinsic masks -+ return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth)); -+} -+ -+Value *ICLAMP(Value* src, Value* low, Value* high); -+Value *FCLAMP(Value* src, Value* low, Value* high); -+Value *FCLAMP(Value* src, float low, float high); -+ -+CallInst *PRINT(const std::string &printStr,const std::initializer_list &printArgs); -+Value* STACKSAVE(); -+void STACKRESTORE(Value* pSaved); -+ -+Value* POPCNT(Value* a); -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp -new file mode 100644 -index 0000000..b4ae075 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp -@@ -0,0 +1,242 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file builder_x86.cpp -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#include "builder.h" -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256); -+ return IRB()->CreateCall5(func, src, pBase, indices, mask, scale); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256); -+ return IRB()->CreateCall5(func, src, pBase, indices, mask, scale); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VSQRTPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_sqrt_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VRSQRTPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_rsqrt_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VRCPPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_rcp_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMINPS(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_min_ps_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMAXPS(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_max_ps_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMINSD(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMAXSD(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VROUND(Value* a, Value* rounding) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256); -+ return IRB()->CreateCall2(func, a, rounding); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCMPPS(Value* a, Value* b, Value* cmpop) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_cmp_ps_256); -+ return IRB()->CreateCall3(func, a, b, cmpop); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VBLENDVPS(Value* a, Value* b, Value* mask) -+{ -+ 
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_blendv_ps_256); -+ return IRB()->CreateCall3(func, a, b, mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BEXTR_32(Value* src, Value* control) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_bmi_bextr_32); -+ return IRB()->CreateCall2(func, src, control); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMASKLOADD(Value* src, Value* mask) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); -+ return IRB()->CreateCall2(func, src, mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMASKMOVPS(Value* src, Value* mask) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256); -+ return IRB()->CreateCall2(func, src, mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPSHUFB(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pshuf_b); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMOVSXBD(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMOVSXWD(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPERMD(Value* idx, Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_permd); -+ return IRB()->CreateCall2(func, idx, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCVTPH2PS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCVTPS2PH(Value* a, Value* round) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtps2ph_256); -+ return IRB()->CreateCall2(func, a, round); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VEXTRACTF128(Value* a, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_ps_256); -+ return IRB()->CreateCall2(func, a, imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VEXTRACTI128(Value* a, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); -+ return IRB()->CreateCall2(func, a, imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VINSERTF128(Value* a, Value* b, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_ps_256); -+ return IRB()->CreateCall3(func, a, b, 
imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VINSERTI128(Value* a, Value* b, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); -+ return IRB()->CreateCall3(func, a, b, imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VHSUBPS(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPTESTC(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_ptestc_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VFMADDPS(Value* a, Value* b, Value* c) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_fma_vfmadd_ps_256); -+ return IRB()->CreateCall3(func, a, b, c); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCVTTPS2DQ(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_cvtt_ps2dq_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMOVMSKPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_movmsk_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h -new file mode 100644 -index 0000000..bdaabca ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h -@@ -0,0 +1,65 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file builder_x86.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// Auto-generated x86 intrinsics -+////////////////////////////////////////////////////////////////////////// -+Value *VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); -+Value *VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); -+Value *VSQRTPS(Value* a); -+Value *VRSQRTPS(Value* a); -+Value *VRCPPS(Value* a); -+Value *VMINPS(Value* a, Value* b); -+Value *VMAXPS(Value* a, Value* b); -+Value *VPMINSD(Value* a, Value* b); -+Value *VPMAXSD(Value* a, Value* b); -+Value *VROUND(Value* a, Value* rounding); -+Value *VCMPPS(Value* a, Value* b, Value* cmpop); -+Value *VBLENDVPS(Value* a, Value* b, Value* mask); -+Value *BEXTR_32(Value* src, Value* control); -+Value *VMASKLOADD(Value* src, Value* mask); -+Value *VMASKMOVPS(Value* src, Value* mask); -+Value *VPSHUFB(Value* a, Value* b); -+Value *VPMOVSXBD(Value* a); -+Value *VPMOVSXWD(Value* a); -+Value *VPERMD(Value* idx, Value* a); -+Value *VCVTPH2PS(Value* a); -+Value *VCVTPS2PH(Value* a, Value* round); -+Value *VEXTRACTF128(Value* a, Value* imm8); -+Value *VEXTRACTI128(Value* a, Value* imm8); -+Value *VINSERTF128(Value* a, Value* b, Value* imm8); -+Value *VINSERTI128(Value* a, Value* b, Value* imm8); -+Value *VHSUBPS(Value* a, Value* b); -+Value *VPTESTC(Value* a, Value* b); -+Value *VFMADDPS(Value* a, Value* b, Value* c); -+Value *VCVTTPS2DQ(Value* a); -+Value *VMOVMSKPS(Value* a); -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp -new file mode 100644 -index 0000000..1b87769 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp -@@ -0,0 +1,1450 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file fetch_jit.cpp -+* -+* @brief Implementation of the fetch jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#include "jit_api.h" -+#include "fetch_jit.h" -+#include "builder.h" -+#include "state_llvm.h" -+#include "common/containers.hpp" -+#include "llvm/IR/DataLayout.h" -+#include -+#include -+ -+//#define FETCH_DUMP_VERTEX 1 -+ -+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); -+ -+enum ConversionType -+{ -+ CONVERT_NONE, -+ CONVERT_NORMALIZED, -+ CONVERT_USCALED, -+ CONVERT_SSCALED, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Interface to Jitting a fetch shader -+////////////////////////////////////////////////////////////////////////// -+struct FetchJit : public Builder -+{ -+ FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; -+ -+ Function* Create(const FETCH_COMPILE_STATE& fetchState); -+ Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); -+ Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); -+ Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); -+ -+ // package up Shuffle*bpcGatherd args into a tuple for convenience -+ typedef std::tuple Shuffle8bpcArgs; -+ void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); -+ -+ typedef std::tuple Shuffle16bpcArgs; -+ void Shuffle16bpcGather(Shuffle16bpcArgs &args); -+ -+ void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); -+ -+ Value* GenerateCompCtrlVector(const ComponentControl ctrl); -+ -+ void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); -+ void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); -+}; -+ -+Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) -+{ -+ static std::size_t fetchNum = 0; -+ -+ std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << fetchNum++; -+ -+ Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); -+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); -+ -+ IRB()->SetInsertPoint(entry); -+ -+ auto argitr = fetch->getArgumentList().begin(); -+ -+ // Fetch shader arguments -+ Value* fetchInfo = argitr; ++argitr; -+ fetchInfo->setName("fetchInfo"); -+ Value* pVtxOut = argitr; -+ pVtxOut->setName("vtxOutput"); -+ // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex -+ // index 0(just the pointer to the simdvertex structure -+ // index 1(which element of the simdvertex structure to offset to(in this case 0) -+ // so the indices being i32's doesn't matter -+ // TODO: generated this GEP with a VECTOR structure type so this makes sense -+ std::vector vtxInputIndices(2, C(0)); -+ // GEP -+ pVtxOut = GEP(pVtxOut, C(0)); -+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); -+ -+ // SWR_FETCH_CONTEXT::pStreams -+ Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); -+ streams->setName("pStreams"); -+ -+ // SWR_FETCH_CONTEXT::pIndices -+ Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices}); -+ indices->setName("pIndices"); -+ -+ // SWR_FETCH_CONTEXT::pLastIndex -+ Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex}); -+ 
pLastIndex->setName("pLastIndex"); -+ -+ -+ Value* vIndices; -+ switch(fetchState.indexType) -+ { -+ case R8_UINT: -+ indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); -+ if(fetchState.bDisableIndexOOBCheck){ -+ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); -+ vIndices = Z_EXT(vIndices, mSimdInt32Ty); -+ } -+ else{ -+ pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); -+ vIndices = GetSimdValid8bitIndices(indices, pLastIndex); -+ } -+ break; -+ case R16_UINT: -+ indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0)); -+ if(fetchState.bDisableIndexOOBCheck){ -+ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); -+ vIndices = Z_EXT(vIndices, mSimdInt32Ty); -+ } -+ else{ -+ pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); -+ vIndices = GetSimdValid16bitIndices(indices, pLastIndex); -+ } -+ break; -+ case R32_UINT: -+ (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0}) -+ : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); -+ break; // incoming type is already 32bit int -+ default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break; -+ } -+ -+ // store out vertex IDs -+ STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); -+ -+ // store out cut mask if enabled -+ if (fetchState.bEnableCutIndex) -+ { -+ Value* vCutIndex = VIMMED1(fetchState.cutIndex); -+ Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); -+ STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); -+ } -+ -+ // Fetch attributes from memory and output to a simdvertex struct -+ // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use -+ (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut) -+ : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut); -+ -+ RET_VOID(); -+ -+ //#define KNOB_SWRC_TRACING -+ -+#if defined(KNOB_SWRC_TRACING) -+ std::string err; -+ char fName[1024]; -+ const char *funcName = fetch->getName().data(); -+ sprintf(fName, "%s.ll", funcName); -+ raw_fd_ostream fetchFD(fName, err, LLVM_F_NONE); -+ fetch->print(fetchFD); -+ fetchFD.flush(); -+#endif -+ verifyFunction(*fetch); -+ -+ FunctionPassManager setupPasses(JM()->mpCurrentModule); -+ -+ ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) -+ setupPasses.add(createBreakCriticalEdgesPass()); -+ setupPasses.add(createCFGSimplificationPass()); -+ setupPasses.add(createEarlyCSEPass()); -+ setupPasses.add(createPromoteMemoryToRegisterPass()); -+ -+ setupPasses.run(*fetch); -+ -+#if defined(KNOB_SWRC_TRACING) -+ sprintf(fName, "%s.se.ll", funcName); -+ raw_fd_ostream seFetchFD(fName, err, LLVM_F_NONE); -+ fetch->print(seFetchFD); -+ seFetchFD.flush(); -+#endif -+ -+ FunctionPassManager optPasses(JM()->mpCurrentModule); -+ -+ ///@todo Haven't touched these either. Need to remove some of these and add others. 
-+ optPasses.add(createCFGSimplificationPass()); -+ optPasses.add(createEarlyCSEPass()); -+ optPasses.add(createInstructionCombiningPass()); -+ optPasses.add(createInstructionSimplifierPass()); -+ optPasses.add(createConstantPropagationPass()); -+ optPasses.add(createSCCPPass()); -+ optPasses.add(createAggressiveDCEPass()); -+ -+ optPasses.run(*fetch); -+ optPasses.run(*fetch); -+ -+#if defined(KNOB_SWRC_TRACING) -+ sprintf(fName, "%s.opt.ll", funcName); -+ raw_fd_ostream optFetchFD(fName, err, LLVM_F_NONE); -+ fetch->print(optFetchFD); -+ optFetchFD.flush(); -+#endif -+ -+ return fetch; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads attributes from memory using LOADs, shuffling the -+/// components into SOA form. -+/// *Note* currently does not support component control, -+/// component packing, or instancing -+/// @param fetchState - info about attributes to be fetched from memory -+/// @param streams - value pointer to the current vertex stream -+/// @param vIndices - vector value of indices to load -+/// @param pVtxOut - value pointer to output simdvertex struct -+void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut) -+{ -+ // Zack shuffles; a variant of the Charleston. -+ -+ SWRL::UncheckedFixedVector vectors; -+ -+ std::vector pMask(JM()->mVWidth); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ pMask[i] = (C(i < 4 ? i : 4)); -+ } -+ Constant* promoteMask = ConstantVector::get(pMask); -+ Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4)); -+ -+ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); -+ -+ for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt) -+ { -+ Value* elements[4] = {0}; -+ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt]; -+ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); -+ uint32_t numComponents = info.numComps; -+ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. -+ -+ vectors.clear(); -+ -+ // load SWR_VERTEX_BUFFER_STATE::pData -+ Value *stream = LOAD(streams, {ied.StreamIndex, 2}); -+ -+ // load SWR_VERTEX_BUFFER_STATE::pitch -+ Value *stride = LOAD(streams, {ied.StreamIndex, 1}); -+ stride = Z_EXT(stride, mInt64Ty); -+ -+ // load SWR_VERTEX_BUFFER_STATE::size -+ Value *size = LOAD(streams, {ied.StreamIndex, 3}); -+ size = Z_EXT(size, mInt64Ty); -+ -+ Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); -+ -+ // Load from the stream. -+ for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) -+ { -+ // Get index -+ Value* index = VEXTRACT(vIndices, C(lane)); -+ index = Z_EXT(index, mInt64Ty); -+ -+ Value* offset = MUL(index, stride); -+ offset = ADD(offset, C((int64_t)ied.AlignedByteOffset)); -+ offset = ADD(offset, startVertexOffset); -+ -+ if (!fetchState.bDisableIndexOOBCheck) { -+ // check for out of bound access, including partial OOB, and mask them to 0 -+ Value *endOffset = ADD(offset, C((int64_t)info.Bpp)); -+ Value *oob = ICMP_ULE(endOffset, size); -+ offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0)); -+ } -+ -+ Value* pointer = GEP(stream, offset); -+ // We use a full-lane, but don't actually care. 
-+ Value* vptr = 0; -+ -+ // get a pointer to a 4 component attrib in default address space -+ switch(bpc) -+ { -+ case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break; -+ case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break; -+ case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break; -+ default: SWR_ASSERT(false, "Unsupported underlying bpp!"); -+ } -+ -+ // load 4 components of attribute -+ Value* vec = ALIGNED_LOAD(vptr, 1, false); -+ -+ // Convert To FP32 internally -+ switch(info.type[0]) -+ { -+ case SWR_TYPE_UNORM: -+ switch(bpc) -+ { -+ case 8: -+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0)))); -+ break; -+ case 16: -+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0)))); -+ break; -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_SNORM: -+ switch(bpc) -+ { -+ case 8: -+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0)))); -+ break; -+ case 16: -+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0)))); -+ break; -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_UINT: -+ // Zero extend uint32_t types. -+ switch(bpc) -+ { -+ case 8: -+ case 16: -+ vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4)); -+ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case 32: -+ break; // Pass through unchanged. -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_SINT: -+ // Sign extend SINT types. -+ switch(bpc) -+ { -+ case 8: -+ case 16: -+ vec = S_EXT(vec, VectorType::get(mInt32Ty, 4)); -+ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case 32: -+ break; // Pass through unchanged. -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_FLOAT: -+ switch(bpc) -+ { -+ case 32: -+ break; // Pass through unchanged. -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ } -+ break; -+ case SWR_TYPE_USCALED: -+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case SWR_TYPE_SSCALED: -+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case SWR_TYPE_UNKNOWN: -+ case SWR_TYPE_UNUSED: -+ SWR_ASSERT(false, "Unsupported type %d!", info.type[0]); -+ } -+ -+ // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4) -+ // uwvec: 4 x F32, undef value -+ Value* wvec = VSHUFFLE(vec, uwvec, promoteMask); -+ vectors.push_back(wvec); -+ } -+ -+ std::vector v01Mask(JM()->mVWidth); -+ std::vector v23Mask(JM()->mVWidth); -+ std::vector v02Mask(JM()->mVWidth); -+ std::vector v13Mask(JM()->mVWidth); -+ -+ // Concatenate the vectors together. 
-+ elements[0] = VUNDEF_F(); -+ elements[1] = VUNDEF_F(); -+ elements[2] = VUNDEF_F(); -+ elements[3] = VUNDEF_F(); -+ for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) -+ { -+ v01Mask[4 * b + 0] = C(0 + 4 * b); -+ v01Mask[4 * b + 1] = C(1 + 4 * b); -+ v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); -+ v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); -+ -+ v23Mask[4 * b + 0] = C(2 + 4 * b); -+ v23Mask[4 * b + 1] = C(3 + 4 * b); -+ v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); -+ v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); -+ -+ v02Mask[4 * b + 0] = C(0 + 4 * b); -+ v02Mask[4 * b + 1] = C(2 + 4 * b); -+ v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); -+ v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); -+ -+ v13Mask[4 * b + 0] = C(1 + 4 * b); -+ v13Mask[4 * b + 1] = C(3 + 4 * b); -+ v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); -+ v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); -+ -+ std::vector iMask(JM()->mVWidth); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ if(((4 * b) <= i) && (i < (4 * (b + 1)))) -+ { -+ iMask[i] = C(i % 4 + JM()->mVWidth); -+ } -+ else -+ { -+ iMask[i] = C(i); -+ } -+ } -+ Constant* insertMask = ConstantVector::get(iMask); -+ elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask); -+ elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask); -+ elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask); -+ elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask); -+ } -+ -+ Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask)); -+ Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask)); -+ Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask)); -+ Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask)); -+ elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask)); -+ elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask)); -+ elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask)); -+ elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask)); -+ -+ switch(numComponents + 1) -+ { -+ case 1: elements[0] = VIMMED1(0.0f); -+ case 2: elements[1] = VIMMED1(0.0f); -+ case 3: elements[2] = VIMMED1(0.0f); -+ case 4: elements[3] = VIMMED1(1.0f); -+ } -+ -+ for(uint32_t c = 0; c < 4; ++c) -+ { -+ Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); -+ STORE(elements[c], dest); -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads attributes from memory using AVX2 GATHER(s) -+/// @param fetchState - info about attributes to be fetched from memory -+/// @param fetchInfo - first argument passed to fetch shader -+/// @param streams - value pointer to the current vertex stream -+/// @param vIndices - vector value of indices to gather -+/// @param pVtxOut - value pointer to output simdvertex struct -+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, -+ Value* streams, Value* vIndices, Value* pVtxOut) -+{ -+ uint32_t currentVertexElement = 0; -+ uint32_t outputElt = 0; -+ Value* vVertexElements[4]; -+ -+ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); -+ Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); -+ Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); -+ Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, 
SWR_FETCH_CONTEXT_BaseVertex})); -+ curInstance->setName("curInstance"); -+ -+ for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt) -+ { -+ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; -+ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); -+ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. -+ -+ Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); -+ -+ // VGATHER* takes an *i8 src pointer -+ Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0)); -+ -+ Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); -+ Value *vStride = VBROADCAST(stride); -+ -+ // max vertex index that is fully in bounds -+ Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); -+ maxVertex = LOAD(maxVertex); -+ -+ Value *vCurIndices; -+ Value *startOffset; -+ if(ied.InstanceEnable) -+ { -+ Value* stepRate = C(ied.InstanceDataStepRate); -+ -+ // prevent a div by 0 for 0 step rate -+ Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); -+ stepRate = SELECT(isNonZeroStep, stepRate, C(1)); -+ -+ // calc the current offset into instanced data buffer -+ Value* calcInstance = UDIV(curInstance, stepRate); -+ -+ // if step rate is 0, every instance gets instance 0 -+ calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); -+ -+ vCurIndices = VBROADCAST(calcInstance); -+ -+ startOffset = startInstance; -+ } -+ else -+ { -+ // offset indices by baseVertex -+ vCurIndices = ADD(vIndices, vBaseVertex); -+ -+ startOffset = startVertex; -+ } -+ -+ // All of the OOB calculations are in vertices, not VB offsets, to prevent having to -+ // do 64bit address offset calculations. -+ -+ // calculate byte offset to the start of the VB -+ Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); -+ pStreamBase = GEP(pStreamBase, baseOffset); -+ -+ // if we have a start offset, subtract from max vertex. Used for OOB check -+ maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); -+ Value* neg = ICMP_SLT(maxVertex, C((int64_t)0)); -+ // if we have a negative value, we're already OOB. clamp at 0. -+ maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty)); -+ -+ // Load the in bounds size of a partially valid vertex -+ Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); -+ partialInboundsSize = LOAD(partialInboundsSize); -+ Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); -+ Value* vBpp = VBROADCAST(C(info.Bpp)); -+ -+ // is the element is <= the partially valid size -+ Value* vElementInBoundsMask = ICMP_ULE(vBpp, vPartialVertexSize); -+ -+ // are vertices partially OOB? -+ Value* vMaxVertex = VBROADCAST(maxVertex); -+ Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); -+ -+ // are vertices are fully in bounds? 
-+ Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); -+ -+ // blend in any partially OOB indices that have valid elements -+ vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); -+ vGatherMask = VMASK(vGatherMask); -+ -+ // calculate the actual offsets into the VB -+ Value* vOffsets = MUL(vCurIndices, vStride); -+ Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); -+ vOffsets = ADD(vOffsets, vAlignmentOffsets); -+ -+ // Packing and component control -+ ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; -+ const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, -+ (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3}; -+ -+ if(info.type[0] == SWR_TYPE_FLOAT) -+ { -+ ///@todo: support 64 bit vb accesses -+ Value* gatherSrc = VIMMED1(0.0f); -+ -+ // Gather components from memory to store in a simdvertex structure -+ switch(bpc) -+ { -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // if we have at least one component out of x or y to fetch -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ // save mask as it is zero'd out after each gather -+ vMask = vGatherMask; -+ -+ vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ } -+ -+ // if we have at least one component out of z or w to fetch -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ // offset base to the next components(zw) in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ vMask = vGatherMask; -+ -+ vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. 
result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ -+ // if we have at least one component to shuffle into place -+ if(compMask){ -+ Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, -+ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); -+ // Shuffle gathered components into place in simdvertex struct -+ Shuffle16bpcGather(args); // outputs to vVertexElements ref -+ } -+ } -+ break; -+ case 32: -+ { -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ if(!isComponentEnabled(compMask, i)){ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ continue; -+ } -+ -+ // if we need to gather the component -+ if(compCtrl[i] == StoreSrc){ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = vGatherMask; -+ -+ // Gather a SIMD of vertices -+ vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ } -+ } -+ break; -+ default: -+ SWR_ASSERT(0, "Tried to fetch invalid FP format"); -+ break; -+ } -+ } -+ else -+ { -+ Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; -+ ConversionType conversionType = CONVERT_NONE; -+ -+ switch(info.type[0]) -+ { -+ case SWR_TYPE_UNORM: -+ conversionType = CONVERT_NORMALIZED; -+ case SWR_TYPE_UINT: -+ extendCastType = Instruction::CastOps::ZExt; -+ break; -+ case SWR_TYPE_SNORM: -+ conversionType = CONVERT_NORMALIZED; -+ case SWR_TYPE_SINT: -+ extendCastType = Instruction::CastOps::SExt; -+ break; -+ case SWR_TYPE_USCALED: -+ conversionType = CONVERT_USCALED; -+ extendCastType = Instruction::CastOps::UIToFP; -+ break; -+ case SWR_TYPE_SSCALED: -+ conversionType = CONVERT_SSCALED; -+ extendCastType = Instruction::CastOps::SIToFP; -+ break; -+ default: -+ break; -+ } -+ -+ // value substituted when component of gather is masked -+ Value* gatherSrc = VIMMED1(0); -+ -+ // Gather components from memory to store in a simdvertex structure -+ switch (bpc) -+ { -+ case 8: -+ { -+ // if we have at least one component to fetch -+ if(compMask){ -+ Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); -+ // e.g. 
result of an 8x32bit integer gather for 8bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw -+ -+ Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, -+ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); -+ // Shuffle gathered components into place in simdvertex struct -+ Shuffle8bpcGatherd(args); // outputs to vVertexElements ref -+ } -+ } -+ break; -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // if we have at least one component out of x or y to fetch -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ // save mask as it is zero'd out after each gather -+ vMask = vGatherMask; -+ -+ vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ } -+ -+ // if we have at least one component out of z or w to fetch -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ // offset base to the next components(zw) in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ vMask = vGatherMask; -+ -+ vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ -+ // if we have at least one component to shuffle into place -+ if(compMask){ -+ Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, -+ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); -+ // Shuffle gathered components into place in simdvertex struct -+ Shuffle16bpcGather(args); // outputs to vVertexElements ref -+ } -+ } -+ break; -+ case 32: -+ { -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ -+ // Gathered components into place in simdvertex struct -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ if(!isComponentEnabled(compMask, i)){ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ continue; -+ } -+ -+ // if we need to gather the component -+ if(compCtrl[i] == StoreSrc){ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = vGatherMask; -+ -+ vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ -+ // e.g. result of a single 8x32bit integer gather for 32bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ } -+ } -+ break; -+ } -+ } -+ } -+ -+ // if we have a partially filled vVertexElement struct, output it -+ if(currentVertexElement > 0){ -+ StoreVertexElements(pVtxOut, outputElt++, currentVertexElement+1, vVertexElements); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a simd of valid indices. 
OOB indices are set to 0 -+/// *Note* have to do 16bit index checking in scalar until we have AVX-512 -+/// support -+/// @param pIndices - pointer to 8 bit indices -+/// @param pLastIndex - pointer to last valid index -+Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) -+{ -+ // can fit 2 16 bit integers per vWidth lane -+ Value* vIndices = VUNDEF_I(); -+ -+ // store 0 index on stack to be used to conditionally load from if index address is OOB -+ Value* pZeroIndex = ALLOCA(mInt8Ty); -+ STORE(C((uint8_t)0), pZeroIndex); -+ -+ // Load a SIMD of index pointers -+ for(int64_t lane = 0; lane < JM()->mVWidth; lane++) -+ { -+ // Calculate the address of the requested index -+ Value *pIndex = GEP(pIndices, C(lane)); -+ -+ // check if the address is less than the max index, -+ Value* mask = ICMP_ULT(pIndex, pLastIndex); -+ -+ // if valid, load the index. if not, load 0 from the stack -+ Value* pValid = SELECT(mask, pIndex, pZeroIndex); -+ Value *index = LOAD(pValid, "valid index"); -+ -+ // zero extended index to 32 bits and insert into the correct simd lane -+ index = Z_EXT(index, mInt32Ty); -+ vIndices = VINSERT(vIndices, index, lane); -+ } -+ return vIndices; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a simd of valid indices. OOB indices are set to 0 -+/// *Note* have to do 16bit index checking in scalar until we have AVX-512 -+/// support -+/// @param pIndices - pointer to 16 bit indices -+/// @param pLastIndex - pointer to last valid index -+Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) -+{ -+ // can fit 2 16 bit integers per vWidth lane -+ Value* vIndices = VUNDEF_I(); -+ -+ // store 0 index on stack to be used to conditionally load from if index address is OOB -+ Value* pZeroIndex = ALLOCA(mInt16Ty); -+ STORE(C((uint16_t)0), pZeroIndex); -+ -+ // Load a SIMD of index pointers -+ for(int64_t lane = 0; lane < JM()->mVWidth; lane++) -+ { -+ // Calculate the address of the requested index -+ Value *pIndex = GEP(pIndices, C(lane)); -+ -+ // check if the address is less than the max index, -+ Value* mask = ICMP_ULT(pIndex, pLastIndex); -+ -+ // if valid, load the index. if not, load 0 from the stack -+ Value* pValid = SELECT(mask, pIndex, pZeroIndex); -+ Value *index = LOAD(pValid, "valid index"); -+ -+ // zero extended index to 32 bits and insert into the correct simd lane -+ index = Z_EXT(index, mInt32Ty); -+ vIndices = VINSERT(vIndices, index, lane); -+ } -+ return vIndices; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a simd of valid indices. 
OOB indices are set to 0 -+/// @param pIndices - pointer to 32 bit indices -+/// @param pLastIndex - pointer to last valid index -+Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) -+{ -+ DataLayout dL(JM()->mpCurrentModule); -+ unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits -+ Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize)); -+ Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize)); -+ -+ // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index) -+ Value* numIndicesLeft = SUB(iLastIndex,iIndices); -+ numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); -+ numIndicesLeft = SDIV(numIndicesLeft, C(4)); -+ -+ // create a vector of index counts from the base index ptr passed into the fetch -+ const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)}; -+ Constant* vIndexOffsets = ConstantVector::get(vecIndices); -+ -+ // compare index count to the max valid index -+ // e.g. vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load -+ // vIndexOffsets 0 1 2 3 4 5 6 7 -+ // ------------------------------ -+ // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass -+ // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 -+ Value* vMaxIndex = VBROADCAST(numIndicesLeft); -+ Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets); -+ -+ // VMASKLOAD takes an *i8 src pointer -+ pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0)); -+ -+ // Load the indices; OOB loads 0 -+ return MASKLOADD(pIndices,vIndexMask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, -+/// denormalizes if needed, converts to F32 if needed, and positions in -+/// the proper SIMD rows to be output to the simdvertex structure -+/// @param args: (tuple of args, listed below) -+/// @param vGatherResult - 8 gathered 8bpc vertices -+/// @param pVtxOut - base pointer to output simdvertex struct -+/// @param extendType - sign extend or zero extend -+/// @param conversionType - conversion to apply (normalized, scaled, or none) 
-+/// @param currentVertexElement - reference to the current vVertexElement -+/// @param outputElt - reference to the current offset from simdvertex we're outputting to -+/// @param compMask - component packing mask -+/// @param compCtrl - component control val -+/// @param vVertexElements[4] - vertex components to output -+/// @param swizzle[4] - component swizzle location -+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) -+{ -+ // Unpack tuple args -+ Value*& vGatherResult = std::get<0>(args); -+ Value* pVtxOut = std::get<1>(args); -+ const Instruction::CastOps extendType = std::get<2>(args); -+ const ConversionType conversionType = std::get<3>(args); -+ uint32_t &currentVertexElement = std::get<4>(args); -+ uint32_t &outputElt = std::get<5>(args); -+ const ComponentEnable compMask = std::get<6>(args); -+ const ComponentControl (&compCtrl)[4] = std::get<7>(args); -+ Value* (&vVertexElements)[4] = std::get<8>(args); -+ const uint32_t (&swizzle)[4] = std::get<9>(args); -+ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits -+ -+ // have to do extra work for sign extending -+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ -+ Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane -+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ -+ // shuffle mask, including any swizzling -+ const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; -+ const char z = (char)swizzle[2]; const char w = (char)swizzle[3]; -+ Value* vConstMask = C({char(x), char(x+4), char(x+8), char(x+12), -+ char(y), char(y+4), char(y+8), char(y+12), -+ char(z), char(z+4), char(z+8), char(z+12), -+ char(w), char(w+4), char(w+8), char(w+12), -+ char(x), char(x+4), char(x+8), char(x+12), -+ char(y), char(y+4), char(y+8), char(y+12), -+ char(z), char(z+4), char(z+8), char(z+12), -+ char(w), char(w+4), char(w+8), char(w+12)}); -+ -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww -+ -+ Value* vi128XY = nullptr; -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); -+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) -+ } -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); -+ } -+ -+ // init denormalize variables if needed -+ Instruction::CastOps fpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ fpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 127.0)); -+ break; -+ case CONVERT_SSCALED: -+ fpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0)); -+ break; -+ case CONVERT_USCALED: -+ SWR_ASSERT(0, "Type should not be sign extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); 
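-+ // CONVERT_NONE: gathered values are used as-is, so no scale factor is needed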
-+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; -+ -+ // sign extend -+ vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty)); -+ -+ // denormalize if needed -+ if(conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ } -+ // else zero extend -+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) -+ { -+ // init denormalize variables if needed -+ Instruction::CastOps fpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 255.0)); -+ break; -+ case CONVERT_USCALED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0)); -+ break; -+ case CONVERT_SSCALED: -+ SWR_ASSERT(0, "Type should not be zero extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // pshufb masks for each component -+ Value* vConstMask; -+ switch(swizzle[i]){ -+ case 0: -+ // x shuffle mask -+ vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, -+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); -+ break; -+ case 1: -+ // y shuffle mask -+ vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, -+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); -+ break; -+ case 2: -+ // z shuffle mask -+ vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, -+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); -+ break; -+ case 3: -+ // w shuffle mask -+ vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, -+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); -+ break; -+ default: -+ vConstMask = nullptr; -+ break; -+ } -+ -+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb for x channel -+ // 256i - 0 1 2 3 4 5 6 7 -+ // x000 x000 x000 x000 x000 x000 x000 x000 -+ -+ // denormalize if needed -+ if (conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ 
currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ } -+ else -+ { -+ SWR_ASSERT(0, "Unsupported conversion type"); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, -+/// denormalizes if needed, converts to F32 if needed, and positions in -+/// the proper SIMD rows to be output to the simdvertex structure -+/// @param args: (tuple of args, listed below) -+/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index -+/// @param pVtxOut - base pointer to output simdvertex struct -+/// @param extendType - sign extend or zero extend -+/// @param conversionType - conversion to apply (normalized, scaled, or none) -+/// @param currentVertexElement - reference to the current vVertexElement -+/// @param outputElt - reference to the current offset from simdvertex we're outputting to -+/// @param compMask - component packing mask -+/// @param compCtrl - component control val -+/// @param vVertexElements[4] - vertex components to output -+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) -+{ -+ // Unpack tuple args -+ Value* (&vGatherResult)[2] = std::get<0>(args); -+ Value* pVtxOut = std::get<1>(args); -+ const Instruction::CastOps extendType = std::get<2>(args); -+ const ConversionType conversionType = std::get<3>(args); -+ uint32_t &currentVertexElement = std::get<4>(args); -+ uint32_t &outputElt = std::get<5>(args); -+ const ComponentEnable compMask = std::get<6>(args); -+ const ComponentControl(&compCtrl)[4] = std::get<7>(args); -+ Value* (&vVertexElements)[4] = std::get<8>(args); -+ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits -+ -+ // have to do extra work for sign extending -+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| -+ (extendType == Instruction::CastOps::FPExt)) -+ { -+ // is this PP float? -+ bool bFP = (extendType == Instruction::CastOps::FPExt) ? 
true : false; -+ -+ Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane -+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ -+ // shuffle mask -+ Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, -+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); -+ Value* vi128XY = nullptr; -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy -+ -+ vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ // after PERMD: move and pack xy components into each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy -+ } -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); -+ vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ } -+ -+ // init denormalize variables if needed -+ Instruction::CastOps IntToFpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ IntToFpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 32767.0)); -+ break; -+ case CONVERT_SSCALED: -+ IntToFpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0)); -+ break; -+ case CONVERT_USCALED: -+ SWR_ASSERT(0, "Type should not be sign extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; -+ -+ if(bFP) { -+ // extract 128 bit lanes to sign extend each component -+ vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); -+ } -+ else { -+ // extract 128 bit lanes to sign extend each component -+ vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); -+ -+ // denormalize if needed -+ if(conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ } -+ currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ -+ } -+ // else zero extend -+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) -+ { -+ // pshufb masks for each component -+ Value* vConstMask[2]; -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){ -+ // x/z shuffle mask -+ vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, -+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); -+ } -+ -+ if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){ -+ // y/w shuffle mask -+ vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, -+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); -+ } -+ -+ // init denormalize variables if needed -+ Instruction::CastOps fpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 65535.0)); -+ break; -+ case CONVERT_USCALED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0f)); -+ break; -+ case CONVERT_SSCALED: -+ SWR_ASSERT(0, "Type should not be zero extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // select correct constMask for x/z or y/w pshufb -+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ uint32_t selectedGather = (i < 2) ? 
0 : 1; -+ -+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); -+ // after pshufb mask for x channel; z uses the same shuffle from the second gather -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 -+ -+ // denormalize if needed -+ if(conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ } -+ else -+ { -+ SWR_ASSERT(0, "Unsupported conversion type"); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Output a simdvertex worth of elements to the current outputElt -+/// @param pVtxOut - base address of VIN output struct -+/// @param outputElt - simdvertex offset in VIN to write to -+/// @param numEltsToStore - number of simdvertex rows to write out -+/// @param vVertexElements - LLVM Value*[] simdvertex to write out -+void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) -+{ -+ for(uint32_t c = 0; c < numEltsToStore; ++c) -+ { -+ // STORE expects FP32 x vWidth type, just bitcast if needed -+ if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){ -+#if FETCH_DUMP_VERTEX -+ PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); -+#endif -+ vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); -+ } -+#if FETCH_DUMP_VERTEX -+ else -+ { -+ PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); -+ } -+#endif -+ // outputElt * 4 = offsetting by the size of a simdvertex -+ // + c offsets to a 32bit x vWidth row within the current vertex -+ Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); -+ STORE(vVertexElements[c], dest); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generates a constant vector of values based on the -+/// ComponentControl value -+/// @param ctrl - ComponentControl value -+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) -+{ -+ switch(ctrl) -+ { -+ case NoStore: return VUNDEF_I(); -+ case Store0: return VIMMED1(0); -+ case Store1Fp: return VIMMED1(1.0f); -+ case Store1Int: return VIMMED1(1); -+ case StoreSrc: -+ default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I(); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns the enable mask for the specified component. -+/// @param enableMask - enable bits -+/// @param component - component to check if enabled. 
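-+/// @return true if the given component's bit is set in enableMask, false otherwise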
-+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) -+{ -+ switch (component) -+ { -+ // X -+ case 0: return (enableMask & ComponentEnable::X); -+ // Y -+ case 1: return (enableMask & ComponentEnable::Y); -+ // Z -+ case 2: return (enableMask & ComponentEnable::Z); -+ // W -+ case 3: return (enableMask & ComponentEnable::W); -+ -+ default: return false; -+ } -+} -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JITs from fetch shader IR -+/// @param hJitMgr - JitManager handle -+/// @param func - LLVM function IR -+/// @return PFN_FETCH_FUNC - pointer to fetch code -+PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) -+{ -+ const llvm::Function* func = (const llvm::Function*)hFunc; -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ PFN_FETCH_FUNC pfnFetch; -+ -+ pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); -+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module -+ pJitMgr->mIsModuleFinalized = true; -+ -+#if defined(KNOB_SWRC_TRACING) -+ char fName[1024]; -+ const char *funcName = func->getName().data(); -+ sprintf(fName, "%s.bin", funcName); -+ FILE *fd = fopen(fName, "wb"); -+ fwrite((void *)pfnFetch, 1, 2048, fd); -+ fclose(fd); -+#endif -+ -+ return pfnFetch; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles fetch shader -+/// @param hJitMgr - JitManager handle -+/// @param state - fetch state to build function from -+extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) -+{ -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ -+ pJitMgr->SetupNewModule(); -+ -+ FetchJit theJit(pJitMgr); -+ HANDLE hFunc = theJit.Create(state); -+ -+ return JitFetchFunc(hJitMgr, hFunc); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h -new file mode 100644 -index 0000000..ea3625d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h -@@ -0,0 +1,128 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file fetch_jit.h -+* -+* @brief Definition of the fetch jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "core/state.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// INPUT_ELEMENT_DESC -+////////////////////////////////////////////////////////////////////////// -+struct INPUT_ELEMENT_DESC -+{ -+ union -+ { -+ struct -+ { -+ uint32_t AlignedByteOffset : 12; -+ uint32_t Format : 10; -+ uint32_t StreamIndex : 6; -+ uint32_t InstanceEnable : 1; -+ uint32_t ComponentControl0 : 3; -+ uint32_t ComponentControl1 : 3; -+ uint32_t ComponentControl2 : 3; -+ uint32_t ComponentControl3 : 3; -+ uint32_t ComponentPacking : 4; -+ uint32_t _reserved : 19; -+ }; -+ uint64_t bits; -+ }; -+ uint32_t InstanceDataStepRate; -+}; -+ -+// used to set ComponentPacking -+enum ComponentEnable -+{ -+ NONE = 0x0, -+ X = 0x1, -+ Y = 0x2, -+ XY = 0x3, -+ Z = 0x4, -+ XZ = 0x5, -+ YZ = 0x6, -+ XYZ = 0x7, -+ W = 0x8, -+ XW = 0x9, -+ YW = 0xA, -+ XYW = 0xB, -+ ZW = 0xC, -+ XZW = 0xD, -+ YZW = 0xE, -+ XYZW = 0xF, -+}; -+ -+enum ComponentControl -+{ -+ NoStore = 0, -+ StoreSrc = 1, -+ Store0 = 2, -+ Store1Fp = 3, -+ Store1Int = 4, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// State required for fetch shader jit compile. -+////////////////////////////////////////////////////////////////////////// -+struct FETCH_COMPILE_STATE -+{ -+ uint32_t numAttribs; -+ INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES]; -+ SWR_FORMAT indexType; -+ uint32_t cutIndex{ 0xffffffff }; -+ -+ // Options that effect the JIT'd code -+ bool bDisableVGATHER; // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs -+ bool bDisableIndexOOBCheck; // if enabled, FetchJit will exclude index OOB check -+ bool bEnableCutIndex{ false }; // compares indices with the cut index and returns a cut mask -+ -+ FETCH_COMPILE_STATE(bool useVGATHER = false, bool indexOOBCheck = false) : -+ bDisableVGATHER(useVGATHER), bDisableIndexOOBCheck(indexOOBCheck){}; -+ -+ bool operator==(const FETCH_COMPILE_STATE &other) const -+ { -+ if (numAttribs != other.numAttribs) return false; -+ if (indexType != other.indexType) return false; -+ if (bDisableVGATHER != other.bDisableVGATHER) return false; -+ if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false; -+ if (bEnableCutIndex != other.bEnableCutIndex) return false; -+ if (cutIndex != other.cutIndex) return false; -+ -+ for(uint32_t i = 0; i < numAttribs; ++i) -+ { -+ if((layout[i].bits != other.layout[i].bits) || -+ ((layout[i].InstanceEnable == 1) && -+ (layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){ -+ return false; -+ } -+ } -+ -+ return true; -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h -new file mode 100644 -index 0000000..afa33bb ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h -@@ -0,0 +1,105 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file jit_api.h -+* -+* @brief Platform independent JIT interface -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+#include "common/os.h" -+ -+#include "fetch_jit.h" -+#include "streamout_jit.h" -+#include "blend_jit.h" -+ -+#if defined(_WIN32) -+#define JITCALL __stdcall -+#else -+#define JITCALL -+#endif -+ -+extern "C" -+{ -+ -+struct ShaderInfo; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create JIT context. -+HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Destroy JIT context. -+void JITCALL JitDestroyContext(HANDLE hJitContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Jit Compile Info Input -+////////////////////////////////////////////////////////////////////////// -+struct JIT_COMPILE_INPUT -+{ -+ SWR_SHADER_TYPE type; -+ -+ const void* pIR; ///< Pointer to LLVM IR text. -+ -+ bool enableJitSampler; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compile shader. -+/// @param hJitContext - Jit Context -+/// @param input - Input containing LLVM IR and other information -+/// @param output - Output containing information about JIT shader -+/// @return HANDLE - pointer to shader object. -+HANDLE JITCALL JitCompileShader( -+ HANDLE hJitContext, -+ const JIT_COMPILE_INPUT& input, -+ ShaderInfo& output); ///@todo Move ShaderInfo into Jitter. -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT destroy shader. -+/// @param hJitContext - Jit Context -+/// @param hShader - pointer to shader object. 
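-+/// Note: assumed to free a shader object previously returned by JitCompileShader for this context.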
-+void JITCALL JitDestroyShader( -+ HANDLE hJitContext, -+ HANDLE hShader); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles fetch shader -+/// @param hJitContext - Jit Context -+/// @param state - Fetch state to build function from -+PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles streamout shader -+/// @param hJitContext - Jit Context -+/// @param state - SO state to build function from -+PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles blend shader -+/// @param hJitContext - Jit Context -+/// @param state - blend state to build function from -+PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state); -+ -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py -new file mode 100644 -index 0000000..268871b ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py -@@ -0,0 +1,334 @@ -+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+#!deps/python32/python.exe -+ -+import os, sys, re -+import argparse -+import json as JSON -+import operator -+ -+header = r""" -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file %s -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+""" -+ -+""" -+""" -+def gen_file_header(filename): -+ global header -+ headerStr = header % filename -+ return headerStr.splitlines() -+ -+""" -+""" -+def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): -+ -+ llvm_type = '' -+ -+ if is_llvm_struct: -+ if is_pointer or is_pointer_pointer: -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ else: -+ llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type -+ elif is_llvm_enum: -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ elif is_llvm_pfn: -+ llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' -+ else: -+ if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool': -+ llvm_type = 'Type::getInt8Ty(ctx)' -+ elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t': -+ llvm_type = 'Type::getInt64Ty(ctx)' -+ elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': -+ llvm_type = 'Type::getInt16Ty(ctx)' -+ elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t': -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ elif type == 'float' or type == 'FLOAT': -+ llvm_type = 'Type::getFloatTy(ctx)' -+ elif type == 'double' or type == 'DOUBLE': -+ llvm_type = 'Type::getDoubleTy(ctx)' -+ elif type == 'void' or type == 'VOID': -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ elif type == 'HANDLE': -+ llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)' -+ elif type == 'simdscalar': -+ llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' -+ elif type == 'simdscalari': -+ llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' -+ elif type == 'simdvector': -+ llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)' -+ else: -+ llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name) -+ -+ if is_pointer: -+ llvm_type = 'PointerType::get(%s, 0)' % llvm_type -+ -+ if is_pointer_pointer: -+ llvm_type = 'PointerType::get(%s, 0)' % llvm_type -+ -+ if is_array_array: -+ llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) -+ elif is_array: -+ llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) -+ -+ return [' members.push_back( %s ); // %s' % (llvm_type, name)] -+ -+""" -+""" -+def gen_llvm_types(input_file, output_file): -+ -+ output_lines = gen_file_header(os.path.basename(output_file.name)) -+ -+ lines = input_file.readlines() -+ -+ postfix_name = "" -+ -+ for idx in range(len(lines)): -+ line = lines[idx].rstrip() -+ -+ match = re.match(r"(\s*)struct(\s*)(\w+)", line) -+ if match: -+ llvm_args = [] -+ -+ # Detect start of structure -+ is_fwd_decl = re.search(r";", line) -+ -+ if not is_fwd_decl: -+ -+ # Extract the command name -+ 
struct_name = match.group(3).strip() -+ -+ output_lines += [ -+ '//////////////////////////////////////////////////////////////////////////', -+ '/// Generate LLVM type information for %s' % struct_name, -+ 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), -+ '{', -+ ' LLVMContext& ctx = pJitMgr->mContext;', -+ ' std::vector members;', -+ '', -+ ] -+ -+ end_of_struct = False -+ -+ while not end_of_struct and idx < len(lines)-1: -+ idx += 1 -+ line = lines[idx].rstrip() -+ -+ ########################################### -+ # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure. -+ is_llvm_struct = re.search(r"@llvm_struct", line) -+ -+ if is_llvm_struct is not None: -+ is_llvm_struct = True -+ else: -+ is_llvm_struct = False -+ -+ ########################################### -+ # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. -+ is_llvm_enum = re.search(r"@llvm_enum", line) -+ -+ if is_llvm_enum is not None: -+ is_llvm_enum = True -+ else: -+ is_llvm_enum = False -+ -+ ########################################### -+ # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type. -+ is_llvm_pfn = re.search(r"@llvm_pfn", line) -+ -+ if is_llvm_pfn is not None: -+ is_llvm_pfn = True -+ else: -+ is_llvm_pfn = False -+ -+ ########################################### -+ # Is field const? -+ is_const = re.search(r"\s+const\s+", line) -+ -+ if is_const is not None: -+ is_const = True -+ else: -+ is_const = False -+ -+ ########################################### -+ # Is field a pointer? -+ is_pointer_pointer = re.search("\*\*", line) -+ -+ if is_pointer_pointer is not None: -+ is_pointer_pointer = True -+ else: -+ is_pointer_pointer = False -+ -+ ########################################### -+ # Is field a pointer? -+ is_pointer = re.search("\*", line) -+ -+ if is_pointer is not None: -+ is_pointer = True -+ else: -+ is_pointer = False -+ -+ ########################################### -+ # Is field an array of arrays? -+ # TODO: Can add this to a list. -+ is_array_array = re.search("\[(\w*)\]\[(\w*)\]", line) -+ array_count = '0' -+ array_count1 = '0' -+ -+ if is_array_array is not None: -+ array_count = is_array_array.group(1) -+ array_count1 = is_array_array.group(2) -+ is_array_array = True -+ else: -+ is_array_array = False -+ -+ ########################################### -+ # Is field an array? 
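-+ # e.g. a field like "STREAMOUT_DECL decl[128]" captures array_count = "128"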
-+ is_array = re.search("\[(\w*)\]", line) -+ -+ if is_array is not None: -+ array_count = is_array.group(1) -+ is_array = True -+ else: -+ is_array = False -+ -+ is_scoped = re.search("::", line) -+ -+ if is_scoped is not None: -+ is_scoped = True -+ else: -+ is_scoped = False -+ -+ type = None -+ name = None -+ if is_const and is_pointer: -+ -+ if is_scoped: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line) -+ -+ type = "%s%s" % (field_match.group(4), field_match.group(5)) -+ name = field_match.group(7) -+ else: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line) -+ -+ type = field_match.group(4) -+ name = field_match.group(6) -+ -+ elif is_pointer: -+ field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(3) -+ name = field_match.group(5) -+ elif is_const: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(4) -+ name = field_match.group(6) -+ else: -+ if is_scoped: -+ field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(1) + '::' + field_match.group(2) -+ name = field_match.group(3) -+ else: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(2) -+ name = field_match.group(4) -+ -+ if type is not None: -+ output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file) -+ llvm_args.append(name) -+ -+ # Detect end of structure -+ end_of_struct = re.match(r"(\s*)};", line) -+ -+ if (end_of_struct): -+ output_lines += [ -+ '', -+ ' return StructType::get(ctx, members, false);', -+ '}', -+ '', -+ ] -+ -+ for i in range(len(llvm_args)): -+ output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) -+ -+ output_lines.append('') -+ -+ output_file.write('\n'.join(output_lines) + '\n') -+ -+""" -+ Function which is invoked when this script is started from a command line. -+ Will present and consume a set of arguments which will tell this script how -+ to behave -+""" -+def main(): -+ -+ # Parse args... -+ parser = argparse.ArgumentParser() -+ parser.add_argument("--input", "-i", type=argparse.FileType('r'), -+ help="Path to input file containing structs", required=True) -+ parser.add_argument("--output", "-o", type=argparse.FileType('w'), -+ help="Path to output file", required=True) -+ parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False) -+ args = parser.parse_args() -+ -+ gen_llvm_types(args.input, args.output) -+ -+if __name__ == '__main__': -+ main() -+# END OF FILE -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp -new file mode 100644 -index 0000000..6a64a1c ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp -@@ -0,0 +1,348 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file streamout_jit.cpp -+* -+* @brief Implementation of the streamout jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#include "jit_api.h" -+#include "streamout_jit.h" -+#include "builder.h" -+#include "state_llvm.h" -+#include "common/containers.hpp" -+#include "llvm/IR/DataLayout.h" -+ -+#include <sstream> -+#include <unordered_set> -+ -+////////////////////////////////////////////////////////////////////////// -+/// Interface to jitting a streamout shader -+////////////////////////////////////////////////////////////////////////// -+struct StreamOutJit : public Builder -+{ -+ StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){}; -+ -+ // returns pointer to SWR_STREAMOUT_BUFFER -+ Value* getSOBuffer(Value* pSoCtx, uint32_t buffer) -+ { -+ return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer }); -+ } -+ -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief checks if streamout buffer is oob -+ // @return true/false -+ Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer) -+ { -+ Value* returnMask = C(false); -+ -+ Value* pBuf = getSOBuffer(pSoCtx, buffer); -+ -+ // load enable -+ // @todo bool data types should generate llvm type -+ Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty()); -+ -+ // load buffer size -+ Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize }); -+ -+ // load current streamOffset -+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ -+ // load buffer pitch -+ Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); -+ -+ // buffer is considered oob if in use in a decl but not enabled -+ returnMask = OR(returnMask, NOT(enabled)); -+ -+ // buffer is oob if cannot fit a prim's worth of verts -+ Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim))); -+ returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize)); -+ -+ return returnMask; -+ } -+ -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector, -+ // packing the active mask bits -+ // ex. 
bitmask 0011 -> (0, 1, 0, 0) -+ // bitmask 1000 -> (3, 0, 0, 0) -+ // bitmask 1100 -> (2, 3, 0, 0) -+ Value* PackMask(uint32_t bitmask) -+ { -+ std::vector<Constant*> indices(4, C(0)); -+ DWORD index; -+ uint32_t elem = 0; -+ while (_BitScanForward(&index, bitmask)) -+ { -+ indices[elem++] = C((int)index); -+ bitmask &= ~(1 << index); -+ } -+ -+ return ConstantVector::get(indices); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief convert scalar bitmask to <4xfloat> bitmask -+ Value* ToMask(uint32_t bitmask) -+ { -+ std::vector<Constant*> indices; -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ if (bitmask & (1 << i)) -+ { -+ indices.push_back(C(-1.0f)); -+ } -+ else -+ { -+ indices.push_back(C(0.0f)); -+ } -+ } -+ return ConstantVector::get(indices); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief processes a single decl from the streamout stream. Reads 4 components from the input -+ // stream and writes N components to the output buffer given the componentMask or if -+ // a hole, just increments the buffer pointer -+ // @param pStream - pointer to current attribute -+ // @param pOutBuffers - pointers to the current location of each output buffer -+ // @param decl - input decl -+ void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) -+ { -+ // @todo add this to x86 macros -+ Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); -+ -+ uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); -+ uint32_t packedMask = (1 << numComponents) - 1; -+ if (!decl.hole) -+ { -+ // increment stream pointer to correct slot -+ Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); -+ -+ // load 4 components from stream -+ Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); -+ Type* simd4PtrTy = PointerType::get(simd4Ty, 0); -+ pAttrib = BITCAST(pAttrib, simd4PtrTy); -+ Value *vattrib = LOAD(pAttrib); -+ -+ // shuffle/pack enabled components -+ Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); -+ -+ // store to output buffer -+ // cast SO buffer to i8*, needed by maskstore -+ Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); -+ -+ // cast input to <4xfloat> -+ Value* src = BITCAST(vpackedAttrib, simd4Ty); -+ CALL3(maskStore, pOut, ToMask(packedMask), src); -+ } -+ -+ // increment SO buffer -+ pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief builds a single vertex worth of data for the given stream -+ // @param streamState - state for this stream -+ // @param pCurVertex - pointer to src stream vertex data -+ // @param pOutBuffer - pointers to up to 4 SO buffers -+ void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4]) -+ { -+ for (uint32_t d = 0; d < streamState.numDecls; ++d) -+ { -+ const STREAMOUT_DECL& decl = streamState.decl[d]; -+ buildDecl(pCurVertex, pOutBuffer, decl); -+ } -+ } -+ -+ void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) -+ { -+ // get list of active SO buffers -+ std::unordered_set<uint32_t> activeSOBuffers; -+ for (uint32_t d = 0; d < streamState.numDecls; ++d) -+ { -+ const STREAMOUT_DECL& decl = streamState.decl[d]; -+ activeSOBuffers.insert(decl.bufferIndex); -+ } -+ -+ // always increment 
numPrimStorageNeeded -+ Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); -+ numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); -+ STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); -+ -+ // check OOB on active SO buffers. If any buffer is out of bounds, don't write -+ // the primitive to any buffer -+ Value* oobMask = C(false); -+ for (uint32_t buffer : activeSOBuffers) -+ { -+ oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); -+ } -+ -+ BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); -+ -+ // early out if OOB -+ COND_BR(oobMask, returnBB, validBB); -+ -+ IRB()->SetInsertPoint(validBB); -+ -+ Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); -+ numPrimsWritten = ADD(numPrimsWritten, C(1)); -+ STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); -+ -+ // compute start pointer for each output buffer -+ Value* pOutBuffer[4]; -+ Value* pOutBufferStartVertex[4]; -+ Value* outBufferPitch[4]; -+ for (uint32_t b: activeSOBuffers) -+ { -+ Value* pBuf = getSOBuffer(pSoCtx, b); -+ Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); -+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ pOutBuffer[b] = GEP(pData, streamOffset); -+ pOutBufferStartVertex[b] = pOutBuffer[b]; -+ -+ outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); -+ } -+ -+ // loop over the vertices of the prim -+ Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); -+ for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) -+ { -+ buildVertex(streamState, pStreamData, pOutBuffer); -+ -+ // increment stream and output buffer pointers -+ // stream verts are always 32*4 dwords apart -+ pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4)); -+ -+ // output buffers offset using pitch in buffer state -+ for (uint32_t b : activeSOBuffers) -+ { -+ pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); -+ pOutBuffer[b] = pOutBufferStartVertex[b]; -+ } -+ } -+ -+ // update each active buffer's streamOffset -+ for (uint32_t b : activeSOBuffers) -+ { -+ Value* pBuf = getSOBuffer(pSoCtx, b); -+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); -+ STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ } -+ } -+ -+ Function* Create(const STREAMOUT_COMPILE_STATE& state) -+ { -+ static std::size_t soNum = 0; -+ -+ std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << soNum++; -+ -+ // SO function signature -+ // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*) -+ -+ std::vector<Type*> args{ -+ PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* -+ }; -+ -+ FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); -+ Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); -+ -+ // create return basic block -+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); -+ BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); -+ -+ IRB()->SetInsertPoint(entry); -+ -+ // arguments -+ auto argitr = soFunc->getArgumentList().begin(); -+ Value* pSoCtx = argitr++; -+ pSoCtx->setName("pSoCtx"); -+ -+ const STREAMOUT_STREAM& streamState = state.stream; -+ buildStream(state, 
streamState, pSoCtx, returnBB, soFunc); -+ -+ BR(returnBB); -+ -+ IRB()->SetInsertPoint(returnBB); -+ RET_VOID(); -+ -+ JitManager::DumpToFile(soFunc, "SoFunc"); -+ -+ FunctionPassManager passes(JM()->mpCurrentModule); -+ passes.add(createBreakCriticalEdgesPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createPromoteMemoryToRegisterPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createInstructionCombiningPass()); -+ passes.add(createInstructionSimplifierPass()); -+ passes.add(createConstantPropagationPass()); -+ passes.add(createSCCPPass()); -+ passes.add(createAggressiveDCEPass()); -+ -+ passes.run(*soFunc); -+ -+ JitManager::DumpToFile(soFunc, "SoFunc_optimized"); -+ -+ return soFunc; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JITs from streamout shader IR -+/// @param hJitMgr - JitManager handle -+/// @param func - LLVM function IR -+/// @return PFN_SO_FUNC - pointer to SOS function -+PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) -+{ -+ const llvm::Function *func = (const llvm::Function*)hFunc; -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ PFN_SO_FUNC pfnStreamOut; -+ pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); -+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module -+ pJitMgr->mIsModuleFinalized = true; -+ -+ return pfnStreamOut; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles streamout shader -+/// @param hJitMgr - JitManager handle -+/// @param state - SO state to build function from -+extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state) -+{ -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ -+ pJitMgr->SetupNewModule(); -+ -+ StreamOutJit theJit(pJitMgr); -+ HANDLE hFunc = theJit.Create(state); -+ -+ return JitStreamoutFunc(hJitMgr, hFunc); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h -new file mode 100644 -index 0000000..4372a9d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h -@@ -0,0 +1,91 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file streamout_jit.h -+* -+* @brief Definition of the streamout jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "core/state.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_DECL - Stream decl -+////////////////////////////////////////////////////////////////////////// -+struct STREAMOUT_DECL -+{ -+ // Buffer that stream maps to. -+ DWORD bufferIndex; -+ -+ // attribute to stream -+ uint32_t attribSlot; -+ -+ // attribute component mask -+ uint32_t componentMask; -+ -+ // indicates this decl is a hole -+ bool hole; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_STREAM - Stream decls -+////////////////////////////////////////////////////////////////////////// -+struct STREAMOUT_STREAM -+{ -+ // number of decls for this stream -+ uint32_t numDecls; -+ -+ // array of numDecls decls -+ STREAMOUT_DECL decl[128]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// State required for streamout jit -+////////////////////////////////////////////////////////////////////////// -+struct STREAMOUT_COMPILE_STATE -+{ -+ // number of verts per primitive -+ uint32_t numVertsPerPrim; -+ -+ // stream decls -+ STREAMOUT_STREAM stream; -+ -+ bool operator==(const STREAMOUT_COMPILE_STATE &other) const -+ { -+ if (numVertsPerPrim != other.numVertsPerPrim) return false; -+ if (stream.numDecls != other.stream.numDecls) return false; -+ -+ for (uint32_t i = 0; i < stream.numDecls; ++i) -+ { -+ if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false; -+ if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false; -+ if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false; -+ if (stream.decl[i].hole != other.stream.decl[i].hole) return false; -+ } -+ -+ return true; -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp -new file mode 100644 -index 0000000..ad73cd8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp -@@ -0,0 +1,287 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software.
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file ClearTile.cpp -+* -+* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro -+* tile in the destination. -+* -+******************************************************************************/ -+#include "common/os.h" -+#include "core/context.h" -+#include "common/formats.h" -+#include "memory/TilingFunctions.h" -+#include "memory/tilingtraits.h" -+#include "memory/Convert.h" -+ -+typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Clear Raster Tile Function Tables. -+////////////////////////////////////////////////////////////////////////// -+static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS]; -+ -+static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreRasterTileClear -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreRasterTileClear -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pColor - Pointer to clear color. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void StoreClear( -+ const BYTE* dstFormattedColor, -+ UINT dstBytesPerPixel, -+ SWR_SURFACE_STATE* pDstSurface, -+ UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. -+ { -+ // Compute destination address for raster tile. -+ BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress + -+ (y * pDstSurface->pitch) + (x * dstBytesPerPixel); -+ -+ // start of first row -+ BYTE* pDst = pDstTile; -+ UINT dstBytesPerRow = 0; -+ -+ // For each raster tile pixel in row 0 (rx, 0) -+ for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx) -+ { -+ memcpy(pDst, dstFormattedColor, dstBytesPerPixel); -+ -+ // Increment pointer to next pixel in row. -+ pDst += dstBytesPerPixel; -+ dstBytesPerRow += dstBytesPerPixel; -+ } -+ -+ // start of second row -+ pDst = pDstTile + pDstSurface->pitch; -+ -+ // For each remaining row in the rest of the raster tile -+ for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry) -+ { -+ // copy row -+ memcpy(pDst, pDstTile, dstBytesPerRow); -+ -+ // Increment pointer to first pixel in next row. -+ pDst += pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles. -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreMacroTileClear -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores a macrotile to the destination surface. -+ /// @param pColor - Pointer to color to write to pixels. 
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void StoreClear( -+ const FLOAT *pColor, -+ SWR_SURFACE_STATE* pDstSurface, -+ UINT x, UINT y) -+ { -+ UINT dstBytesPerPixel = (FormatTraits::bpp / 8); -+ -+ BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel -+ -+ FLOAT srcColor[4]; -+ -+ for (UINT comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ srcColor[comp] = pColor[FormatTraits::swizzle(comp)]; -+ } -+ -+ // using this helper function, but the Tiling Traits is unused inside it so just using a dummy value -+ ConvertPixelFromFloat(dstFormattedColor, srcColor); -+ -+ // Store each raster tile from the hot tile to the destination surface. -+ // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens. -+ // Intent is for this function to only handle full tiles. -+ for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ StoreRasterTileClear::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row)); -+ } -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Writes clear color to every pixel of a render surface -+/// @param hPrivateContext - Handle to private DC -+/// @param renderTargetIndex - Index to destination render target -+/// @param x, y - Coordinates to raster tile. -+/// @param pClearColor - Pointer to clear color -+void StoreHotTileClear( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, -+ UINT y, -+ const float* pClearColor) -+{ -+ PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL; -+ -+ SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet. -+ -+ if (renderTargetIndex != SWR_ATTACHMENT_DEPTH) -+ { -+ pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format]; -+ } -+ else -+ { -+ pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format]; -+ } -+ -+ SWR_ASSERT(pfnStoreTilesClear != NULL); -+ -+ // Store a macro tile. -+ /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress. -+ if (pfnStoreTilesClear != NULL) -+ { -+ pfnStoreTilesClear(pClearColor, pDstSurface, x, y); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 
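Each entry of the table built by the macro below instantiates StoreMacroTileClear for one surface format; StoreHotTileClear above then dispatches by indexing the table with pDstSurface->format. As a rough illustration of the per-pixel conversion these clears rely on, here is a hedged sketch using the non-templated ConvertPixelFromFloat overload from Convert.h (the exact byte order depends on the format traits for the chosen format):

    // Illustrative only -- not part of the patch.
    uint8_t pixel[4];
    const float clearColor[4] = { 1.0f, 0.5f, 0.0f, 1.0f }; // RGBA
    ConvertPixelFromFloat(R8G8B8A8_UNORM, pixel, clearColor);
    // Each UNORM8 channel is clamped to [0,1], scaled by 255 and rounded,
    // so pixel should hold { 0xFF, 0x80, 0x00, 0xFF }.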
-+#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \ -+ memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \ -+ \ -+ sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = 
StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_UINT] 
= StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear::StoreClear; \ -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. -+#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \ -+ memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \ -+ \ -+ sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear::StoreClear; \ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sets up tables for ClearTile -+void InitSimClearTilesTable() -+{ -+ INIT_STORE_TILES_CLEAR_COLOR_TABLE(); -+ INIT_STORE_TILES_CLEAR_DEPTH_TABLE(); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h -new file mode 100644 -index 0000000..0f9e0ad ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h -@@ -0,0 +1,698 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file Convert.h -+* -+* @brief Conversion utility functions -+* -+******************************************************************************/ -+#pragma once -+ -+#if defined(_WIN32) -+// disable "potential divide by 0" -+#pragma warning(disable: 4723) -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision -+/// float -+/// @param val - 16-bit float -+/// @todo Maybe move this outside of this file into a header? 
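For reference, a few worked conversions through this helper (standard IEEE half-precision encodings; easy to check against the bit manipulation below):

    // Illustrative values only -- not part of the patch.
    ConvertSmallFloatTo32(0x3C00); // == 1.0f (sign 0, exponent 15, mantissa 0)
    ConvertSmallFloatTo32(0x4000); // == 2.0f
    ConvertSmallFloatTo32(0xC000); // == -2.0f
    ConvertSmallFloatTo32(0x7C00); // == +infinity (bits 0x7f800000)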
-+static float ConvertSmallFloatTo32(UINT val) -+{ -+ UINT result; -+ if ((val & 0x7fff) == 0) -+ { -+ result = ((uint32_t)(val & 0x8000)) << 16; -+ } -+ else if ((val & 0x7c00) == 0x7c00) -+ { -+ result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; -+ result |= ((uint32_t)val & 0x8000) << 16; -+ } -+ else -+ { -+ uint32_t sign = (val & 0x8000) << 16; -+ uint32_t mant = (val & 0x3ff) << 13; -+ uint32_t exp = (val >> 10) & 0x1f; -+ if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals -+ { -+ mant <<= 1; -+ while (mant < (0x400 << 13)) -+ { -+ exp--; -+ mant <<= 1; -+ } -+ mant &= (0x3ff << 13); -+ } -+ exp = ((exp - 15 + 127) & 0xff) << 23; -+ result = sign | exp | mant; -+ } -+ -+ return *(float*)&result; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert an IEEE 754 32-bit single precision float to an -+/// unsigned small float with 5 exponent bits and a variable -+/// number of mantissa bits. -+/// @param val - 32-bit float -+/// @todo Maybe move this outside of this file into a header? -+template -+static UINT Convert32ToSmallFloat(float val) -+{ -+ uint32_t sign, exp, mant; -+ uint32_t roundBits; -+ -+ // Extract the sign, exponent, and mantissa -+ UINT uf = *(UINT*)&val; -+ -+ sign = (uf & 0x80000000) >> 31; -+ exp = (uf & 0x7F800000) >> 23; -+ mant = uf & 0x007FFFFF; -+ -+ // 10/11 bit floats are unsigned. Negative values are clamped to 0. -+ if (sign != 0) -+ { -+ exp = mant = 0; -+ } -+ // Check for out of range -+ else if ((exp == 0xFF) && (mant != 0)) // NaN -+ { -+ exp = 0x1F; -+ mant = 1 << numMantissaBits; -+ } -+ else if ((exp == 0xFF) && (mant == 0)) // INF -+ { -+ exp = 0x1F; -+ mant = 0; -+ } -+ else if (exp > (0x70 + 0x1E)) // Too big to represent -+ { -+ exp = 0x1Eu; -+ mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa. -+ } -+ else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm -+ { -+ mant |= 0x00800000; -+ for (; exp <= 0x70; mant >>= 1, exp++) -+ ; -+ exp = 0; -+ mant = mant >> (23 - numMantissaBits); -+ } -+ else if (exp < 0x66) // Too small to represent -> Zero -+ { -+ exp = 0; -+ mant = 0; -+ } -+ else -+ { -+ // Saves bits that will be shifted off for rounding -+ roundBits = mant & 0x1FFFu; -+ // convert exponent and mantissa to 16 bit format -+ exp = exp - 0x70u; -+ mant = mant >> (23 - numMantissaBits); -+ -+ // Essentially RTZ, but round up if off by only 1 lsb -+ if (roundBits == 0x1FFFu) -+ { -+ mant++; -+ // check for overflow -+ if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits) -+ exp++; -+ // make sure only the needed bits are used -+ mant &= (1 << numMantissaBits) - 1; -+ } -+ } -+ -+ UINT tmpVal = (exp << numMantissaBits) | mant; -+ return tmpVal; -+} -+ -+#if KNOB_ARCH == KNOB_ARCH_AVX -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert an IEEE 754 32-bit single precision float to an -+/// 16 bit float with 5 exponent bits and a variable -+/// number of mantissa bits. -+/// @param val - 32-bit float -+/// @todo Maybe move this outside of this file into a header? 
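And in the encode direction, the routine below should behave as follows (worked from its clamping rules; illustrative only):

    // Illustrative values only -- not part of the patch.
    Convert32To16Float(1.0f);     // == 0x3C00
    Convert32To16Float(65504.0f); // == 0x7BFF (largest finite half; larger finite inputs clamp here)
    Convert32To16Float(INFINITY); // == 0x7C00
    // NaN inputs produce 0xFE00, since this routine sets the sign bit for NaNs.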
-+static uint16_t Convert32To16Float(float val) -+{ -+ uint32_t sign, exp, mant; -+ uint32_t roundBits; -+ -+ // Extract the sign, exponent, and mantissa -+ uint32_t uf = *(uint32_t*)&val; -+ sign = (uf & 0x80000000) >> 31; -+ exp = (uf & 0x7F800000) >> 23; -+ mant = uf & 0x007FFFFF; -+ -+ // Check for out of range -+ if (std::isnan(val)) -+ { -+ exp = 0x1F; -+ mant = 0x200; -+ sign = 1; // set the sign bit for NANs -+ } -+ else if (std::isinf(val)) -+ { -+ exp = 0x1f; -+ mant = 0x0; -+ } -+ else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value -+ { -+ exp = 0x1E; -+ mant = 0x3FF; -+ } -+ else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm -+ { -+ mant |= 0x00800000; -+ for (; exp <= 0x70; mant >>= 1, exp++) -+ ; -+ exp = 0; -+ mant = mant >> 13; -+ } -+ else if (exp < 0x66) // Too small to represent -> Zero -+ { -+ exp = 0; -+ mant = 0; -+ } -+ else -+ { -+ // Saves bits that will be shifted off for rounding -+ roundBits = mant & 0x1FFFu; -+ // convert exponent and mantissa to 16 bit format -+ exp = exp - 0x70; -+ mant = mant >> 13; -+ -+ // Essentially RTZ, but round up if off by only 1 lsb -+ if (roundBits == 0x1FFFu) -+ { -+ mant++; -+ // check for overflow -+ if ((mant & 0xC00u) != 0) -+ exp++; -+ // make sure only the needed bits are used -+ mant &= 0x3FF; -+ } -+ } -+ -+ uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; -+ return (uint16_t)tmpVal; -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Retrieve color from hot tile source which is always float. -+/// @param pDstPixel - Pointer to destination pixel. -+/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest). -+template -+static void ConvertPixelFromFloat( -+ BYTE* pDstPixel, -+ const float srcPixel[4]) -+{ -+ UINT outColor[4]; // typeless bits -+ -+ // Store component -+ for (UINT comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ SWR_TYPE type = FormatTraits::GetType(comp); -+ -+ float src = srcPixel[comp]; -+ -+ switch (type) -+ { -+ case SWR_TYPE_UNORM: -+ { -+ // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. -+ src = (src != src) ? 0.0f : src; -+ -+ // Clamp [0, 1] -+ src = std::max(src, 0.0f); -+ src = std::min(src, 1.0f); -+ -+ // SRGB -+ if (FormatTraits::isSRGB && comp != 3) -+ { -+ src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f); -+ } -+ -+ // Float scale to integer scale. -+ UINT scale = (1 << FormatTraits::GetBPC(comp)) - 1; -+ src = (float)scale * src; -+ src = roundf(src); -+ outColor[comp] = (UINT)src; // Drop fractional part. -+ break; -+ } -+ case SWR_TYPE_SNORM: -+ { -+ SWR_ASSERT(!FormatTraits::isSRGB); -+ -+ // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. -+ src = (src != src) ? 0.0f : src; -+ -+ // Clamp [-1, 1] -+ src = std::max(src, -1.0f); -+ src = std::min(src, 1.0f); -+ -+ // Float scale to integer scale. -+ UINT scale = (1 << (FormatTraits::GetBPC(comp) - 1)) - 1; -+ src = (float)scale * src; -+ -+ // Round -+ src += (src >= 0) ? 0.5f : -0.5f; -+ -+ INT out = (INT)src; -+ -+ outColor[comp] = *(UINT*)&out; -+ -+ break; -+ } -+ case SWR_TYPE_UINT: -+ { -+ ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float. -+ // However, the number in the hot tile should be unsigned integer. So doing this -+ // to preserve bits intead of doing a float -> integer conversion. 
-+ if (FormatTraits::GetBPC(comp) == 32) -+ { -+ outColor[comp] = *(UINT*)&src; -+ } -+ else -+ { -+ outColor[comp] = *(UINT*)&src; -+ UINT max = (1 << FormatTraits::GetBPC(comp)) - 1; // 2^numBits - 1 -+ -+ outColor[comp] = std::min(max, outColor[comp]); -+ } -+ break; -+ } -+ case SWR_TYPE_SINT: -+ { -+ if (FormatTraits::GetBPC(comp) == 32) -+ { -+ outColor[comp] = *(UINT*)&src; -+ } -+ else -+ { -+ INT out = *(INT*)&src; // Hot tile format is SINT? -+ INT max = (1 << (FormatTraits::GetBPC(comp) - 1)) - 1; -+ INT min = -1 - max; -+ -+ ///@note The output is unsigned integer (bag of bits) and so performing -+ // the clamping here based on range of output component. Also, manually adding -+ // the sign bit in the appropriate spot. Maybe a better way? -+ out = std::max(out, min); -+ out = std::min(out, max); -+ -+ outColor[comp] = *(UINT*)&out; -+ } -+ break; -+ } -+ case SWR_TYPE_FLOAT: -+ { -+ if (FormatTraits::GetBPC(comp) == 16) -+ { -+ // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph -+ // @todo 16bit float instruction support is orthogonal to avx support. need to -+ // add check for F16C support instead. -+#if KNOB_ARCH == KNOB_ARCH_AVX2 -+ __m128 src128 = _mm_set1_ps(src); -+ __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC); -+ UINT value = _mm_extract_epi16(srci128, 0); -+#else -+ UINT value = Convert32To16Float(src); -+#endif -+ -+ outColor[comp] = value; -+ } -+ else if (FormatTraits::GetBPC(comp) == 11) -+ { -+ outColor[comp] = Convert32ToSmallFloat<6>(src); -+ } -+ else if (FormatTraits::GetBPC(comp) == 10) -+ { -+ outColor[comp] = Convert32ToSmallFloat<5>(src); -+ } -+ else -+ { -+ outColor[comp] = *(UINT*)&src; -+ } -+ -+ break; -+ } -+ default: -+ SWR_ASSERT(0); -+ break; -+ } -+ } -+ -+ typename FormatTraits::FormatT* pPixel = (typename FormatTraits::FormatT*)pDstPixel; -+ -+ switch (FormatTraits::numComps) -+ { -+ case 4: -+ pPixel->a = outColor[3]; -+ case 3: -+ pPixel->b = outColor[2]; -+ case 2: -+ pPixel->g = outColor[1]; -+ case 1: -+ pPixel->r = outColor[0]; -+ break; -+ default: -+ SWR_ASSERT(0); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert pixel in any format to float32 -+/// @param pDstPixel - Pointer to destination pixel. 
-+/// @param srcPixel - Pointer to source pixel -+template -+INLINE static void ConvertPixelToFloat( -+ float dstPixel[4], -+ const BYTE* pSrc) -+{ -+ UINT srcColor[4]; // typeless bits -+ -+ // unpack src pixel -+ typename FormatTraits::FormatT* pPixel = (typename FormatTraits::FormatT*)pSrc; -+ -+ // apply format defaults -+ for (uint32_t comp = 0; comp < 4; ++comp) -+ { -+ uint32_t def = FormatTraits::GetDefault(comp); -+ dstPixel[comp] = *(float*)&def; -+ } -+ -+ // load format data -+ switch (FormatTraits::numComps) -+ { -+ case 4: -+ srcColor[3] = pPixel->a; -+ case 3: -+ srcColor[2] = pPixel->b; -+ case 2: -+ srcColor[1] = pPixel->g; -+ case 1: -+ srcColor[0] = pPixel->r; -+ break; -+ default: -+ SWR_ASSERT(0); -+ } -+ -+ // Convert components -+ for (UINT comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ SWR_TYPE type = FormatTraits::GetType(comp); -+ -+ UINT src = srcColor[comp]; -+ -+ switch (type) -+ { -+ case SWR_TYPE_UNORM: -+ { -+ float dst; -+ if (FormatTraits::isSRGB && comp != 3) -+ { -+ dst = *(float*)&srgb8Table[src]; -+ } -+ else -+ { -+ // component sizes > 16 must use fp divide to maintain ulp requirements -+ if (FormatTraits::GetBPC(comp) > 16) -+ { -+ dst = (float)src / (float)((1 << FormatTraits::GetBPC(comp)) - 1); -+ } -+ else -+ { -+ const float scale = (1.0f / (float)((1 << FormatTraits::GetBPC(comp)) - 1)); -+ dst = (float)src * scale; -+ } -+ } -+ dstPixel[FormatTraits::swizzle(comp)] = dst; -+ break; -+ } -+ case SWR_TYPE_SNORM: -+ { -+ SWR_ASSERT(!FormatTraits::isSRGB); -+ -+ float dst; -+ if (src == 0x10) -+ { -+ dst = -1.0f; -+ } -+ else -+ { -+ switch (FormatTraits::GetBPC(comp)) -+ { -+ case 8: -+ dst = (float)((int8_t)src); -+ break; -+ case 16: -+ dst = (float)((int16_t)src); -+ break; -+ case 32: -+ dst = (float)((int32_t)src); -+ break; -+ default: -+ assert(0 && "attempted to load from SNORM with unsupported bpc"); -+ dst = 0.0f; -+ break; -+ } -+ dst = dst * (1.0f / ((1 << (FormatTraits::GetBPC(comp) - 1)) - 1)); -+ } -+ dstPixel[FormatTraits::swizzle(comp)] = dst; -+ break; -+ } -+ case SWR_TYPE_UINT: -+ { -+ UINT dst = (UINT)src; -+ dstPixel[FormatTraits::swizzle(comp)] = *(float*)&dst; -+ break; -+ } -+ case SWR_TYPE_SINT: -+ { -+ int dst; -+ switch (FormatTraits::GetBPC(comp)) -+ { -+ case 8: -+ dst = (int8_t)src; -+ break; -+ case 16: -+ dst = (int16_t)src; -+ break; -+ case 32: -+ dst = (int32_t)src; -+ break; -+ default: -+ assert(0 && "attempted to load from SINT with unsupported bpc"); -+ dst = 0; -+ break; -+ } -+ dstPixel[FormatTraits::swizzle(comp)] = *(float*)&dst; -+ break; -+ } -+ case SWR_TYPE_FLOAT: -+ { -+ float dst; -+ if (FormatTraits::GetBPC(comp) == 16) -+ { -+#if KNOB_ARCH == KNOB_ARCH_AVX2 -+ // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps -+ // @todo 16bit float instruction support is orthogonal to avx support. need to -+ // add check for F16C support instead. 
-+ __m128i src128 = _mm_set1_epi32(src); -+ __m128 res = _mm_cvtph_ps(src128); -+ _mm_store_ss(&dst, res); -+#else -+ dst = ConvertSmallFloatTo32(src); -+#endif -+ } -+ else if (FormatTraits::GetBPC(comp) == 11) -+ { -+ dst = ConvertSmallFloatTo32(src << 4); -+ } -+ else if (FormatTraits::GetBPC(comp) == 10) -+ { -+ dst = ConvertSmallFloatTo32(src << 5); -+ } -+ else -+ { -+ dst = *(float*)&src; -+ } -+ -+ dstPixel[FormatTraits::swizzle(comp)] = *(float*)&dst; -+ break; -+ } -+ default: -+ SWR_ASSERT(0); -+ break; -+ } -+ } -+} -+ -+// non-templated version of conversion functions -+INLINE static void ConvertPixelFromFloat( -+ SWR_FORMAT format, -+ uint8_t* pDst, -+ const float srcPixel[4]) -+{ -+ switch (format) -+ { -+ case R32G32B32A32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32X32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16X16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16X16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8A8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); 
break; -+ case R16G16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R11G11B10_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8X8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8X8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10X2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10X2_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G6R5_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5A1_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B4G4R4A4_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5X1_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_UINT: 
ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case YCRCB_SWAPUVY: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC1_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC3_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC4_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC5_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC1_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC2_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC3_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case YCRCB_SWAPUV: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC4_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC5_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC7_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC7_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ default: -+ break; -+ } -+} -+ -+ -diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp -new file mode 100644 -index 0000000..49893e8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp -@@ -0,0 +1,382 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file LoadTile.cpp -+* -+* @brief Functionality for Load -+* -+******************************************************************************/ -+#include "common/os.h" -+#include "common/formats.h" -+#include "core/context.h" -+#include "core/rdtsc_core.h" -+#include "memory/TilingFunctions.h" -+#include "memory/tilingtraits.h" -+#include "memory/Convert.h" -+ -+typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Load Raster Tile Function Tables. -+////////////////////////////////////////////////////////////////////////// -+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -+static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -+ -+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; -+ -+static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// LoadRasterTile -+////////////////////////////////////////////////////////////////////////// -+template -+struct LoadRasterTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from hot tile source which is always float. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param x, y - Coordinates to raster tile. -+ /// @param output - output color -+ INLINE static void SetSwizzledDstColor( -+ const float srcColor[4], -+ uint32_t x, uint32_t y, -+ uint8_t* pDst) -+ { -+ typedef SimdTile SimdT; -+ -+ SimdT* pDstSimdTiles = (SimdT*)pDst; -+ -+ // Compute which simd tile we're accessing within 8x8 tile. -+ // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. -+ uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); -+ -+ SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; -+ -+ uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); -+ -+ pSimdTile->SetSwizzledColor(simdOffset, srcColor); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Loads an 8x8 raster tile from the src surface. 
-+ /// @param pSrcSurface - Src surface state -+ /// @param pDst - Destination hot tile pointer -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Load( -+ SWR_SURFACE_STATE* pSrcSurface, -+ uint8_t* pDst, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. -+ { -+ uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; -+ uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; -+ -+ // For each raster tile pixel (rx, ry) -+ for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) -+ { -+ for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) -+ { -+ if (((x + rx) < lodWidth) && -+ ((y + ry) < lodHeight)) -+ { -+ uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, -+ pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, -+ pSrcSurface->lod, pSrcSurface); -+ -+ float srcColor[4]; -+ ConvertPixelToFloat(srcColor, pSrc); -+ -+ // store pixel to hottile -+ SetSwizzledDstColor(srcColor, rx, ry, pDst); -+ } -+ } -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// LoadMacroTile - Loads a macro tile which consists of raster tiles. -+////////////////////////////////////////////////////////////////////////// -+template -+struct LoadMacroTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Load a macrotile to the destination surface. -+ /// @param pSrc - Pointer to macro tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void Load( -+ SWR_SURFACE_STATE* pSrcSurface, -+ uint8_t *pDstHotTile, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) -+ { -+ // Load each raster tile from the hot tile to the destination surface. -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) -+ { -+ LoadRasterTile::Load(pSrcSurface, pDstHotTile, -+ (x + col), (y + row), sampleNum, renderTargetArrayIndex); -+ pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); -+ } -+ } -+ } -+ } -+}; -+ -+static void BUCKETS_START(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StartBucket(id); -+#endif -+} -+ -+static void BUCKETS_STOP(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StopBucket(id); -+#endif -+} -+ -+// on demand buckets for load tiles -+static std::vector sBuckets(NUM_SWR_FORMATS, -1); -+static std::mutex sBucketMutex; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a full hottile from a render surface -+/// @param hPrivateContext - Handle to private DC -+/// @param dstFormat - Format for hot tile. -+/// @param renderTargetIndex - Index to src render target -+/// @param x, y - Coordinates to raster tile. 
-+/// @param pDstHotTile - Pointer to Hot Tile -+void LoadHotTile( -+ SWR_SURFACE_STATE *pSrcSurface, -+ SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, -+ uint8_t *pDstHotTile) -+{ -+ PFN_LOAD_TILES pfnLoadTiles = NULL; -+ -+ // don't need to load null surfaces -+ if (pSrcSurface->type == SURFACE_NULL) -+ { -+ return; -+ } -+ -+ if (renderTargetIndex < SWR_ATTACHMENT_DEPTH) -+ { -+ switch (pSrcSurface->tileMode) -+ { -+ case SWR_TILE_NONE: -+ pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format]; -+ break; -+ case SWR_TILE_MODE_YMAJOR: -+ pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; -+ break; -+ case SWR_TILE_MODE_XMAJOR: -+ pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format]; -+ break; -+ default: -+ SWR_ASSERT(0, "Unsupported tiling mode"); -+ break; -+ } -+ } -+ else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) -+ { -+ // Currently depth can map to linear and tile-y. -+ switch (pSrcSurface->tileMode) -+ { -+ case SWR_TILE_NONE: -+ pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format]; -+ break; -+ case SWR_TILE_MODE_YMAJOR: -+ pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; -+ break; -+ default: -+ SWR_ASSERT(0, "Unsupported tiling mode"); -+ break; -+ } -+ } -+ else -+ { -+ SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL); -+ SWR_ASSERT(pSrcSurface->format == R8_UINT); -+ switch (pSrcSurface->tileMode) -+ { -+ case SWR_TILE_NONE: -+ pfnLoadTiles = LoadMacroTile, R8_UINT, R8_UINT>::Load; -+ break; -+ case SWR_TILE_MODE_WMAJOR: -+ pfnLoadTiles = LoadMacroTile, R8_UINT, R8_UINT>::Load; -+ break; -+ default: -+ SWR_ASSERT(0, "Unsupported tiling mode"); -+ break; -+ } -+ } -+ -+ SWR_ASSERT(pfnLoadTiles != NULL); -+ -+ // Load a macro tile. -+#ifdef KNOB_ENABLE_RDTSC -+ if (sBuckets[pSrcSurface->format] == -1) -+ { -+ // guard sBuckets update since storetiles is called by multiple threads -+ sBucketMutex.lock(); -+ if (sBuckets[pSrcSurface->format] == -1) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format); -+ BUCKET_DESC desc{ info.name, "", false, 0xffffffff }; -+ sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc); -+ } -+ sBucketMutex.unlock(); -+ } -+#endif -+ -+ BUCKETS_START(sBuckets[pSrcSurface->format]); -+ pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex); -+ BUCKETS_STOP(sBuckets[pSrcSurface->format]); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables. 
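The macro below fills one of these per-tiling-mode tables with LoadMacroTile instantiations, one per color format. The loaders above write each pixel into a SIMD-swizzled hot tile; a minimal sketch of that index math, using assumed tile dimensions (8x8 raster tiles, 4x2 SIMD tiles for an 8-wide target) purely for illustration:

    // Illustrative only -- the real dimensions come from the KNOB_* / SIMD_TILE_* constants.
    constexpr uint32_t TILE_X = 8, SIMD_X = 4, SIMD_Y = 2;
    constexpr uint32_t x = 5, y = 3; // pixel within the 8x8 raster tile
    constexpr uint32_t simdIndex  = (y / SIMD_Y) * (TILE_X / SIMD_X) + (x / SIMD_X); // == 3
    constexpr uint32_t simdOffset = (y % SIMD_Y) * SIMD_X + (x % SIMD_X);            // == 5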
-+#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \ -+ memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \ -+ \ -+ sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ 
sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile, R32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile, R32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile, R16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile, R16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile, R16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile, R16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile, A16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ 
sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile, R8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile, R8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile, R8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile, R8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile, A8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = LoadMacroTile, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ 
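// ---------------------------------------------------------------------------
// Note: the INIT_LOAD_TILES_*_TABLE macros above populate per-tile-mode lookup
// tables indexed by SWR_FORMAT, so LoadHotTile can pick a fully specialized
// template instantiation with a single array read. The following is a minimal
// standalone sketch of that dispatch pattern using toy enums and a toy loader;
// it is illustrative only and none of these names are part of the SWR sources.
#include <cassert>
#include <cstdint>

enum ToyFormat   { FMT_R8_UNORM, FMT_R32_FLOAT, NUM_TOY_FORMATS };
enum ToyTileMode { TILE_LINEAR, TILE_Y, NUM_TOY_TILE_MODES };

typedef void (*PFN_TOY_LOAD)(const uint8_t* pSrc, uint8_t* pDst);

template <ToyTileMode TileMode, ToyFormat Format>
static void ToyLoadTile(const uint8_t* pSrc, uint8_t* pDst)
{
    // A real loader would detile and convert here; the point is that TileMode
    // and Format are compile-time constants inside this body.
    (void)pSrc; (void)pDst;
}

static PFN_TOY_LOAD sToyLoadTable[NUM_TOY_TILE_MODES][NUM_TOY_FORMATS];

static void InitToyLoadTable()
{
    // One entry per (tile mode, format), mirroring the macros above.
    sToyLoadTable[TILE_LINEAR][FMT_R8_UNORM]  = ToyLoadTile<TILE_LINEAR, FMT_R8_UNORM>;
    sToyLoadTable[TILE_LINEAR][FMT_R32_FLOAT] = ToyLoadTile<TILE_LINEAR, FMT_R32_FLOAT>;
    sToyLoadTable[TILE_Y][FMT_R8_UNORM]       = ToyLoadTile<TILE_Y, FMT_R8_UNORM>;
    sToyLoadTable[TILE_Y][FMT_R32_FLOAT]      = ToyLoadTile<TILE_Y, FMT_R32_FLOAT>;
}

static void ToyLoadHotTile(ToyTileMode mode, ToyFormat fmt,
                           const uint8_t* pSrc, uint8_t* pDst)
{
    PFN_TOY_LOAD pfn = sToyLoadTable[mode][fmt];
    assert(pfn != nullptr && "unsupported tile mode / format combination");
    pfn(pSrc, pDst);
}
// ---------------------------------------------------------------------------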
-+////////////////////////////////////////////////////////////////////////// -+/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables. -+#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \ -+ memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \ -+ \ -+ sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile, R16_UNORM, R32_FLOAT>::Load; \ -+ sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32_FLOAT>::Load; \ -+ sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sets up tables for LoadTile -+void InitSimLoadTilesTable() -+{ -+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE); -+ INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE); -+ -+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR); -+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR); -+ -+ INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp -new file mode 100644 -index 0000000..fbd76a3 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp -@@ -0,0 +1,1645 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file StoreTile.cpp -+* -+* @brief Functionality for Store. -+* -+******************************************************************************/ -+#include "common/os.h" -+#include "common/formats.h" -+#include "core/context.h" -+#include "core/rdtsc_core.h" -+#include "core/format_conversion.h" -+ -+#include "memory/TilingFunctions.h" -+#include "memory/tilingtraits.h" -+#include "memory/Convert.h" -+#include "core/multisample.h" -+ -+#include -+#include -+ -+typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Store Raster Tile Function Tables. 
-+////////////////////////////////////////////////////////////////////////// -+static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -+static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -+static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template -+struct StorePixels -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<8, 2> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) -+ { -+ // Each 4-pixel row is 4 bytes. -+ const uint16_t* pPixSrc = (const uint16_t*)pSrc; -+ -+ // Unswizzle from SWR-Z order -+ uint16_t* pRow = (uint16_t*)ppDsts[0]; -+ pRow[0] = pPixSrc[0]; -+ pRow[1] = pPixSrc[2]; -+ -+ pRow = (uint16_t*)ppDsts[1]; -+ pRow[0] = pPixSrc[1]; -+ pRow[1] = pPixSrc[3]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<16, 2> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) -+ { -+ // Each 4-pixel row is 8 bytes. -+ const uint32_t* pPixSrc = (const uint32_t*)pSrc; -+ -+ // Unswizzle from SWR-Z order -+ uint32_t* pRow = (uint32_t*)ppDsts[0]; -+ pRow[0] = pPixSrc[0]; -+ pRow[1] = pPixSrc[2]; -+ -+ pRow = (uint32_t*)ppDsts[1]; -+ pRow[0] = pPixSrc[1]; -+ pRow[1] = pPixSrc[3]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. 
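// ---------------------------------------------------------------------------
// Note: the StorePixels specializations above unswizzle a 4x2-pixel SIMD tile
// from "SWR-Z" order into two linear destination rows. Judging from the index
// pattern in the 8/16/32-bit cases, the source appears to hold two 2x2 quads,
// each quad stored row 0 then row 1, with the quads laid out left to right.
// Below is a scalar, one-pixel-at-a-time sketch of that indexing under that
// assumption; the real code moves whole row fragments with wider loads/stores.
#include <cstdint>

template <int BytesPerPixel>
static void UnswizzleSwrZ4x2(const uint8_t* pSrc, uint8_t* pRow0, uint8_t* pRow1)
{
    for (int quad = 0; quad < 2; ++quad)            // two 2x2 quads side by side
    {
        for (int y = 0; y < 2; ++y)                 // two rows within the quad
        {
            for (int x = 0; x < 2; ++x)             // two pixels within the row
            {
                const uint8_t* src = pSrc +
                    ((quad * 4) + (y * 2) + x) * BytesPerPixel;   // SWR-Z order
                uint8_t* dst = (y == 0 ? pRow0 : pRow1) +
                    ((quad * 2) + x) * BytesPerPixel;             // linear rows
                for (int b = 0; b < BytesPerPixel; ++b)
                    dst[b] = src[b];
            }
        }
    }
}
// With BytesPerPixel == 2 this reproduces the StorePixels<16, 2> mapping:
// destination row 0 receives source words 0 and 2, row 1 receives 1 and 3.
// ---------------------------------------------------------------------------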
-+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<32, 2> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) -+ { -+ // Each 4-pixel row is 16-bytes -+ __m128i *pZRow01 = (__m128i*)pSrc; -+ __m128i vQuad00 = _mm_load_si128(pZRow01); -+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); -+ -+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); -+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); -+ -+ _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); -+ _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<64, 4> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) -+ { -+ // Each 4-pixel row is 32 bytes. -+ const __m128i* pPixSrc = (const __m128i*)pSrc; -+ -+ // order of pointers match SWR-Z layout -+ __m128i** pvDsts = (__m128i**)&ppDsts[0]; -+ *pvDsts[0] = pPixSrc[0]; -+ *pvDsts[1] = pPixSrc[1]; -+ *pvDsts[2] = pPixSrc[2]; -+ *pvDsts[3] = pPixSrc[3]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<128, 8> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) -+ { -+ // Each 4-pixel row is 64 bytes. -+ const __m128i* pPixSrc = (const __m128i*)pSrc; -+ -+ // Unswizzle from SWR-Z order -+ __m128i** pvDsts = (__m128i**)&ppDsts[0]; -+ *pvDsts[0] = pPixSrc[0]; -+ *pvDsts[1] = pPixSrc[2]; -+ *pvDsts[2] = pPixSrc[1]; -+ *pvDsts[3] = pPixSrc[3]; -+ *pvDsts[4] = pPixSrc[4]; -+ *pvDsts[5] = pPixSrc[6]; -+ *pvDsts[6] = pPixSrc[5]; -+ *pvDsts[7] = pPixSrc[7]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -+////////////////////////////////////////////////////////////////////////// -+template -+struct ConvertPixelsSOAtoAOS -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Converts a SIMD from the Hot Tile to the destination format -+ /// and converts from SOA to AOS. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDst - Pointer to destination surface or deswizzling buffer. 
-+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel -+ -+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; -+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; -+ -+ // Convert from SrcFormat --> DstFormat -+ simdvector src; -+ LoadSOA(pSrc, src); -+ StoreSOA(src, soaTile); -+ -+ // Convert from SOA --> AOS -+ FormatTraits::TransposeT::Transpose(soaTile, aosTile); -+ -+ // Store data into destination -+ StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -+/// Specialization for no format conversion -+////////////////////////////////////////////////////////////////////////// -+template -+struct ConvertPixelsSOAtoAOS -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Converts a SIMD from the Hot Tile to the destination format -+ /// and converts from SOA to AOS. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDst - Pointer to destination surface or deswizzling buffer. -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel -+ -+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; -+ -+ // Convert from SOA --> AOS -+ FormatTraits::TransposeT::Transpose(pSrc, aosTile); -+ -+ // Store data into destination -+ StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct ConvertPixelsSOAtoAOS -+{ -+ static const SWR_FORMAT SrcFormat = R32_FLOAT; -+ static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Converts a SIMD from the Hot Tile to the destination format -+ /// and converts from SOA to AOS. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDst - Pointer to destination surface or deswizzling buffer. 
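// ---------------------------------------------------------------------------
// Note: ConvertPixelsSOAtoAOS above does two things per SIMD tile: an optional
// format conversion, then a structure-of-arrays to array-of-structures
// rearrangement before the raw row stores. The hot tile keeps one plane per
// channel (RRRR..., GGGG..., ...) while the surface wants interleaved pixels
// (RGBA RGBA ...). A minimal scalar reference of just the SOA -> AOS step is
// sketched here; it is illustrative only and stands in for the per-format
// SIMD transpose routines used by the driver.
#include <cstddef>

static void TransposeSoaToAos(const float* pSoa,  // numChannels planes of numPixels values
                              float*       pAos,  // numPixels interleaved pixels
                              size_t numPixels,
                              size_t numChannels)
{
    for (size_t p = 0; p < numPixels; ++p)
        for (size_t c = 0; c < numChannels; ++c)
            pAos[p * numChannels + c] = pSoa[c * numPixels + p];
}

// Example: for an 8-wide SIMD tile of 4-channel float pixels, numPixels == 8
// and numChannels == 4, so 4 planes of 8 floats become 8 interleaved pixels.
// ---------------------------------------------------------------------------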
-+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel -+ -+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; -+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; -+ -+ // Convert from SrcFormat --> DstFormat -+ simdvector src; -+ LoadSOA(pSrc, src); -+ StoreSOA(src, soaTile); -+ -+ // Convert from SOA --> AOS -+ FormatTraits::TransposeT::Transpose(soaTile, aosTile); -+ -+ // Store data into destination but don't overwrite the X8 bits -+ // Each 4-pixel row is 16-bytes -+ __m128i *pZRow01 = (__m128i*)aosTile; -+ __m128i vQuad00 = _mm_load_si128(pZRow01); -+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); -+ -+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); -+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); -+ -+ __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); -+ __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); -+ -+ __m128i vMask = _mm_set1_epi32(0xFFFFFF); -+ -+ vDst0 = _mm_andnot_si128(vMask, vDst0); -+ vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); -+ vDst1 = _mm_andnot_si128(vMask, vDst1); -+ vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); -+ -+ _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); -+ _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); -+ } -+}; -+ -+template -+INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -+{ -+ static const uint32_t offset = sizeof(simdscalar); -+ -+ // swizzle rgba -> bgra while we load -+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr -+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg -+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb -+ simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(3))*offset)); // float32 aaaaaaaa -+ -+ // clamp -+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); -+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); -+ -+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); -+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); -+ -+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); -+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); -+ -+ vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); -+ vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); -+ -+ if (FormatTraits::isSRGB) -+ { -+ // Gamma-correct only rgb -+ vComp0 = FormatTraits::convertSrgb(0, vComp0); -+ vComp1 = FormatTraits::convertSrgb(1, vComp1); -+ vComp2 = FormatTraits::convertSrgb(2, vComp2); -+ } -+ -+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 
255 dest format -+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); -+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); -+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); -+ vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits::fromFloat(3))); -+ -+ // moving to 8 wide integer vector types -+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr -+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg -+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb -+ __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa -+ -+#if KNOB_ARCH == KNOB_ARCH_AVX -+ -+ // splitting into two sets of 4 wide integer vector types -+ // because AVX doesn't have instructions to support this operation at 8 wide -+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r -+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g -+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b -+ __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a -+ -+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r -+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g -+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b -+ __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a -+ -+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 -+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 -+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 -+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 -+ srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 -+ srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr -+ srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 -+ -+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr -+ srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr -+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr -+ -+ // unpack into rows that get the tiling order correct -+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr -+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); -+ -+ __m256i final = _mm256_castsi128_si256(vRow00); -+ final = _mm256_insertf128_si256(final, vRow10, 1); -+ -+#elif KNOB_ARCH == KNOB_ARCH_AVX2 -+ -+ // logic is as above, only wider -+ src1 = _mm256_slli_si256(src1, 1); -+ src2 = _mm256_slli_si256(src2, 2); -+ src3 = _mm256_slli_si256(src3, 3); -+ -+ src0 = _mm256_or_si256(src0, src1); -+ src2 = _mm256_or_si256(src2, src3); -+ -+ __m256i final = _mm256_or_si256(src0, src2); -+ -+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 -+ final = _mm256_permute4x64_epi64(final, 0xD8); -+ -+#endif -+ -+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); -+} -+ -+template -+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -+{ -+ static const uint32_t offset = sizeof(simdscalar); -+ -+ // swizzle rgba -> bgra while we load -+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr -+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg -+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb 
-+ // clamp -+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); -+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); -+ -+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); -+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); -+ -+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); -+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); -+ -+ if (FormatTraits::isSRGB) -+ { -+ // Gamma-correct only rgb -+ vComp0 = FormatTraits::convertSrgb(0, vComp0); -+ vComp1 = FormatTraits::convertSrgb(1, vComp1); -+ vComp2 = FormatTraits::convertSrgb(2, vComp2); -+ } -+ -+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format -+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); -+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); -+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); -+ -+ // moving to 8 wide integer vector types -+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr -+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg -+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb -+ -+#if KNOB_ARCH == KNOB_ARCH_AVX -+ -+ // splitting into two sets of 4 wide integer vector types -+ // because AVX doesn't have instructions to support this operation at 8 wide -+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r -+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g -+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b -+ -+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r -+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g -+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b -+ -+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 -+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 -+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 -+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr -+ -+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr -+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr -+ -+ // unpack into rows that get the tiling order correct -+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr -+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); -+ -+ __m256i final = _mm256_castsi128_si256(vRow00); -+ final = _mm256_insertf128_si256(final, vRow10, 1); -+ -+#elif KNOB_ARCH == KNOB_ARCH_AVX2 -+ -+ // logic is as above, only wider -+ src1 = _mm256_slli_si256(src1, 1); -+ src2 = _mm256_slli_si256(src2, 2); -+ -+ src0 = _mm256_or_si256(src0, src1); -+ -+ __m256i final = _mm256_or_si256(src0, src2); -+ -+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 -+ final = _mm256_permute4x64_epi64(final, 0xD8); -+ -+#endif -+ -+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); -+} -+ -+template<> -+struct ConvertPixelsSOAtoAOS -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > -+{ -+ template -+ 
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreRasterTile -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreRasterTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from hot tile source which is always float. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param x, y - Coordinates to raster tile. -+ /// @param output - output color -+ INLINE static void GetSwizzledSrcColor( -+ uint8_t* pSrc, -+ uint32_t x, uint32_t y, -+ float outputColor[4]) -+ { -+ typedef SimdTile SimdT; -+ -+ SimdT* pSrcSimdTiles = (SimdT*)pSrc; -+ -+ // Compute which simd tile we're accessing within 8x8 tile. -+ // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. -+ uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); -+ -+ SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; -+ -+ uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); -+ -+ pSimdTile->GetSwizzledColor(simdOffset, outputColor); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. -+ { -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ -+ // For each raster tile pixel (rx, ry) -+ for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) -+ { -+ for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) -+ { -+ // Perform bounds checking. 
-+ if (((x + rx) < lodWidth) && -+ ((y + ry) < lodHeight)) -+ { -+ float srcColor[4]; -+ GetSwizzledSrcColor(pSrc, rx, ry, srcColor); -+ -+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry), -+ pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ sampleNum, pDstSurface->lod, pDstSurface); -+ ConvertPixelFromFloat(pDst, srcColor); -+ } -+ } -+ } -+ } -+}; -+ -+template -+struct OptStoreRasterTile : StoreRasterTile -+{}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); -+ -+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; -+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; -+ } -+ -+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. 
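// ---------------------------------------------------------------------------
// Note: every OptStoreRasterTile specialization begins with the same guard
// seen above: clamp the surface dimensions to the current LOD and punt any
// raster tile that hangs off the surface edge to the generic per-pixel
// StoreRasterTile path, because the fast path writes whole SIMD rows with no
// per-pixel bounds checks. A standalone sketch of that guard follows; the toy
// types, tile size, and helper names are illustrative, not the SWR API.
#include <algorithm>
#include <cstdint>

struct ToySurface { uint32_t width, height, lod; };

static const uint32_t TOY_TILE_DIM = 8;

static void StoreTileSafe(const ToySurface&, uint32_t, uint32_t) { /* bounds-checked per-pixel path */ }
static void StoreTileFast(const ToySurface&, uint32_t, uint32_t) { /* unchecked full-tile SIMD path */ }

static void StoreTile(const ToySurface& surf, uint32_t x, uint32_t y)
{
    // Mip level dimensions, clamped to at least one pixel.
    uint32_t lodWidth  = std::max<uint32_t>(surf.width  >> surf.lod, 1u);
    uint32_t lodHeight = std::max<uint32_t>(surf.height >> surf.lod, 1u);

    if (x + TOY_TILE_DIM > lodWidth || y + TOY_TILE_DIM > lodHeight)
    {
        StoreTileSafe(surf, x, y);   // partial tile at the surface edge
        return;
    }
    StoreTileFast(surf, x, y);       // full interior tile
}
// ---------------------------------------------------------------------------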
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); -+ -+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; -+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; -+ } -+ -+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); -+ -+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; -+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; -+ } -+ -+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t MAX_DST_COLUMN_BYTES = 16; -+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; -+ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppDsts[] = -+ { -+ pDst, // row 0, col 0 -+ pDst + pDstSurface->pitch, // row 1, col 0 -+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 -+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 -+ }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = -+ { -+ ppDsts[0], -+ ppDsts[1], -+ ppDsts[2], -+ ppDsts[3], -+ }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ -+ ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; -+ ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; -+ ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; -+ ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; -+ pSrc += SRC_COLUMN_BYTES; -+ } -+ -+ ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; -+ ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t MAX_DST_COLUMN_BYTES = 16; -+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; -+ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ struct DstPtrs -+ { -+ uint8_t* ppDsts[8]; -+ } ptrs; -+ -+ // Need 8 pointers, 4 columns of 2 rows each -+ for (uint32_t y = 0; y < 2; ++y) -+ { -+ for (uint32_t x = 0; x < 4; ++x) -+ { -+ ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; -+ } -+ } -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ DstPtrs startPtrs = ptrs; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); -+ -+ ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; -+ pSrc += SRC_COLUMN_BYTES; -+ } -+ -+ ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. -+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
-+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ -+ uint8_t* pRow = pCol0 + rowOffset; -+ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestRowWidthBytes / 4; -+ ppDsts[1] += DestRowWidthBytes / 4; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. -+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ -+ uint8_t* pRow = pCol0 + rowOffset; -+ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestRowWidthBytes / 2; -+ ppDsts[1] += DestRowWidthBytes / 2; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. 
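// ---------------------------------------------------------------------------
// Note: the YMAJOR and XMAJOR specializations above rely on the tile layouts
// described in their comments: a TileY 4KB tile is column-major, 8 columns of
// (32 rows x 16 bytes), so stepping down one row adds 16 bytes and stepping to
// the next column adds 32 * 16 = 512 bytes; a TileX 4KB tile is row-major,
// 8 rows of 512 bytes. The helpers below sketch the byte offset of a location
// within one such tile under exactly those assumptions; they are illustrative
// and are not SWR's actual addressing code.
#include <cstdint>

static uint32_t TileYOffset(uint32_t xBytes, uint32_t y)
{
    uint32_t column        = xBytes / 16;          // which 16B-wide column
    uint32_t xWithinColumn = xBytes % 16;
    return column * (32 * 16) + y * 16 + xWithinColumn;
}

static uint32_t TileXOffset(uint32_t xBytes, uint32_t y)
{
    return y * 512 + xBytes;                       // plain row-major rows
}

// Example: for 32bpp pixels, pixel (x = 5, y = 3) within a tile has
// xBytes = 20, so TileY places it at 1*512 + 3*16 + 4 = 564 bytes from the
// tile base, while TileX places it at 3*512 + 20 = 1556 bytes.
// ---------------------------------------------------------------------------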
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 512; // 512B rows -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* pRow1 = pRow0 + DestRowWidthBytes; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) -+ { -+ uint32_t xRowOffset = col * (FormatTraits::bpp / 8); -+ -+ uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ pSrc += (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ } -+ -+ pRow0 += (DestRowWidthBytes * 2); -+ pRow1 += (DestRowWidthBytes * 2); -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. 
-+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ -+ uint8_t* pRow = pCol0 + rowOffset; -+ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestColumnBytes; -+ ppDsts[1] += DestColumnBytes; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. -+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* pCol1 = pCol0 + DestColumnBytes; -+ -+ // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
-+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ uint8_t* ppDsts[] = -+ { -+ pCol0 + rowOffset, -+ pCol0 + rowOffset + DestRowWidthBytes, -+ pCol1 + rowOffset, -+ pCol1 + rowOffset + DestRowWidthBytes, -+ }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestColumnBytes * 2; -+ ppDsts[1] += DestColumnBytes * 2; -+ ppDsts[2] += DestColumnBytes * 2; -+ ppDsts[3] += DestColumnBytes * 2; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ static const size_t TILE_Y_COL_WIDTH_BYTES = 16; -+ static const size_t TILE_Y_ROWS = 32; -+ static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; -+ -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t MAX_DST_COLUMN_BYTES = 16; -+ -+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; -+ static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ struct DstPtrs -+ { -+ uint8_t* ppDsts[8]; -+ } ptrs; -+ -+ // Need 8 pointers, 4 columns of 2 rows each -+ for (uint32_t y = 0; y < 2; ++y) -+ { -+ for (uint32_t x = 0; x < 4; ++x) -+ { -+ ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; -+ } -+ } -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ DstPtrs startPtrs = ptrs; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
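-+ // (Illustrative, assuming the 4x2 AVX simd tile: at 128bpp one pixel fills an entire 16B TileY -+ // row, so each simd tile consumes the eight pointers set up above - four adjacent columns spaced -+ // TILE_Y_COL_BYTES apart by two rows spaced TILE_Y_COL_WIDTH_BYTES apart.)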
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); -+ -+ ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; -+ pSrc += SRC_COLUMN_BYTES; -+ } -+ -+ ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreMacroTile - Stores a macro tile which consists of raster tiles. -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreMacroTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores a macrotile to the destination surface using safe implementation. -+ /// @param pSrc - Pointer to macro tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void StoreGeneric( -+ uint8_t *pSrcHotTile, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) -+ { -+ // Store each raster tile from the hot tile to the destination surface. -+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) -+ { -+ StoreRasterTile::Store (pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, -+ renderTargetArrayIndex); -+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); -+ } -+ } -+ } -+ } -+ -+ typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores a macrotile to the destination surface. -+ /// @param pSrc - Pointer to macro tile. 
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void Store( -+ uint8_t *pSrcHotTile, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) -+ { -+ PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; -+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) -+ { -+ size_t dstSurfAddress = (size_t)ComputeSurfaceAddress( -+ 0, -+ 0, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces -+ pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays -+ sampleNum, -+ pDstSurface->lod, -+ pDstSurface); -+ -+ // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear -+ bool bForceGeneric = (pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff)); -+ -+ pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile::Store : OptStoreRasterTile::Store; -+ } -+ -+ // Store each raster tile from the hot tile to the destination surface. -+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) -+ { -+ pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); -+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); -+ } -+ } -+ } -+ } -+}; -+ -+static void BUCKETS_START(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StartBucket(id); -+#endif -+} -+ -+static void BUCKETS_STOP(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StopBucket(id); -+#endif -+} -+ -+// on demand buckets for store tiles -+static std::mutex sBucketMutex; -+static std::vector sBuckets(NUM_SWR_FORMATS, -1); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Deswizzles and stores a full hottile to a render surface -+/// @param hPrivateContext - Handle to private DC -+/// @param srcFormat - Format for hot tile. -+/// @param renderTargetIndex - Index to destination render target -+/// @param x, y - Coordinates to raster tile. 
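-+/// @param renderTargetArrayIndex - Render target array index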
-+/// @param pSrcHotTile - Pointer to Hot Tile -+void StoreHotTile( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, -+ uint8_t *pSrcHotTile) -+{ -+ // shouldn't ever see a null surface come through StoreTiles -+ SWR_ASSERT(pDstSurface->type != SURFACE_NULL); -+ -+ PFN_STORE_TILES pfnStoreTiles = nullptr; -+ if(renderTargetIndex <= SWR_ATTACHMENT_COLOR7) -+ { -+ pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format]; -+ } -+ else if(renderTargetIndex == SWR_ATTACHMENT_DEPTH) -+ { -+ pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format]; -+ } -+ else -+ { -+ pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format]; -+ } -+ -+ if(nullptr == pfnStoreTiles) -+ { -+ SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles"); -+ } -+ -+ // Store a macro tile -+#ifdef KNOB_ENABLE_RDTSC -+ if (sBuckets[pDstSurface->format] == -1) -+ { -+ // guard sBuckets update since storetiles is called by multiple threads -+ sBucketMutex.lock(); -+ if (sBuckets[pDstSurface->format] == -1) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format); -+ BUCKET_DESC desc{info.name, "", false, 0xffffffff}; -+ sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc); -+ } -+ sBucketMutex.unlock(); -+ } -+#endif -+ -+ BUCKETS_START(sBuckets[pDstSurface->format]); -+ pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex); -+ BUCKETS_STOP(sBuckets[pDstSurface->format]); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// InitStoreTilesTable - Helper for setting up the tables. -+template -+void InitStoreTilesTableColor( -+ PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) -+{ -+ table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; -+ table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; -+ table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; -+ table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; -+ table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; -+ table[TileModeT][R32G32B32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; -+ table[TileModeT][R32G32B32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; -+ table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; -+ table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; -+ table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; -+ table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; -+ table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; -+ table[TileModeT][R32G32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; -+ table[TileModeT][R32G32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_SINT>::Store; -+ table[TileModeT][R32G32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_UINT>::Store; -+ table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; -+ table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile, 
R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; -+ table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; -+ table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; -+ table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; -+ -+ table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; -+ table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; -+ table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; -+ table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; -+ table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; -+ table[TileModeT][R16G16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; -+ table[TileModeT][R16G16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; -+ table[TileModeT][R16G16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SINT>::Store; -+ table[TileModeT][R16G16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UINT>::Store; -+ table[TileModeT][R16G16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; -+ -+ table[TileModeT][R32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_SINT>::Store; -+ table[TileModeT][R32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_UINT>::Store; -+ table[TileModeT][R32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_FLOAT>::Store; -+ table[TileModeT][A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A32_FLOAT>::Store; -+ table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; -+ table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; -+ table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; -+ table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; -+ table[TileModeT][B5G6R5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreGeneric; -+ table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; -+ table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, 
B4G4R4A4_UNORM>::StoreGeneric; -+ table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; -+ -+ table[TileModeT][R8G8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; -+ table[TileModeT][R8G8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; -+ table[TileModeT][R8G8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SINT>::Store; -+ table[TileModeT][R8G8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UINT>::Store; -+ table[TileModeT][R16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UNORM>::Store; -+ table[TileModeT][R16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SNORM>::Store; -+ table[TileModeT][R16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SINT>::Store; -+ table[TileModeT][R16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UINT>::Store; -+ table[TileModeT][R16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_FLOAT>::Store; -+ table[TileModeT][A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A16_UNORM>::Store; -+ table[TileModeT][A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A16_FLOAT>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; -+ table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; -+ -+ table[TileModeT][R8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UNORM>::Store; -+ table[TileModeT][R8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SNORM>::Store; -+ table[TileModeT][R8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SINT>::Store; -+ table[TileModeT][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; -+ table[TileModeT][A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A8_UNORM>::Store; -+ table[TileModeT][BC1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM>::Store; -+ table[TileModeT][BC2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM>::Store; -+ table[TileModeT][BC3_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM>::Store; -+ table[TileModeT][BC4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_UNORM>::Store; -+ table[TileModeT][BC5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_UNORM>::Store; -+ table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; -+ table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; -+ table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; -+ table[TileModeT][R8G8B8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; -+ table[TileModeT][R8G8B8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; -+ table[TileModeT][BC4_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_SNORM>::Store; -+ table[TileModeT][BC5_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_SNORM>::Store; -+ table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; -+ table[TileModeT][R16G16B16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; -+ table[TileModeT][R16G16B16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; -+ table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; -+ table[TileModeT][R16G16B16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; -+ table[TileModeT][R16G16B16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; -+ -+ // 
101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; -+ table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; -+ -+ table[TileModeT][R8G8B8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; -+ table[TileModeT][R8G8B8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. -+template -+void InitStoreTilesTableDepth( -+ PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -+{ -+ table[TileModeT][R32_FLOAT] = StoreMacroTile, R32_FLOAT, R32_FLOAT>::Store; -+ table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; -+ table[TileModeT][R16_UNORM] = StoreMacroTile, R32_FLOAT, R16_UNORM>::Store; -+} -+ -+template -+void InitStoreTilesTableStencil( -+ PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -+{ -+ table[TileModeT][R8_UINT] = StoreMacroTile, R8_UINT, R8_UINT>::Store; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sets up tables for StoreTile -+void InitSimStoreTilesTable() -+{ -+ InitStoreTilesTableColor(sStoreTilesTableColor); -+ InitStoreTilesTableDepth(sStoreTilesTableDepth); -+ InitStoreTilesTableStencil(sStoreTilesTableStencil); -+ -+ InitStoreTilesTableColor(sStoreTilesTableColor); -+ InitStoreTilesTableColor(sStoreTilesTableColor); -+ -+ InitStoreTilesTableDepth(sStoreTilesTableDepth); -+ InitStoreTilesTableStencil(sStoreTilesTableStencil); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h -new file mode 100644 -index 0000000..78f54f8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h -@@ -0,0 +1,518 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file TilingFunctions.h -+* -+* @brief Tiling functions. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "core/state.h" -+#include "core/format_traits.h" -+#include "memory/tilingtraits.h" -+ -+#include -+ -+#define MAX_NUM_LOD 15 -+ -+#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit. -+ -+////////////////////////////////////////////////////////////////////////// -+/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?) -+////////////////////////////////////////////////////////////////////////// -+template -+struct SimdTile -+{ -+ // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa ) -+ float color[FormatTraits::numComps][KNOB_SIMD_WIDTH]; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from simd. -+ /// @param index - linear index to color within simd. -+ /// @param outputColor - output color -+ INLINE void GetSwizzledColor( -+ uint32_t index, -+ float outputColor[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. -+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ outputColor[i] = this->color[FormatTraits::swizzle(i)][offset[index]]; -+ } -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from simd. -+ /// @param index - linear index to color within simd. -+ /// @param outputColor - output color -+ INLINE void SetSwizzledColor( -+ uint32_t index, -+ const float src[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. -+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ // Only loop over the components needed for destination. -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ this->color[i][offset[index]] = src[i]; -+ } -+ } -+}; -+ -+template<> -+struct SimdTile -+{ -+ // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa ) -+ uint8_t color[FormatTraits::numComps][KNOB_SIMD_WIDTH]; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from simd. -+ /// @param index - linear index to color within simd. -+ /// @param outputColor - output color -+ INLINE void GetSwizzledColor( -+ uint32_t index, -+ float outputColor[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. 
-+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ uint32_t src = this->color[FormatTraits::swizzle(i)][offset[index]]; -+ outputColor[i] = *(float*)&src; -+ } -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Set color within simd. -+ /// @param index - linear index to color within simd. -+ /// @param src - input color -+ INLINE void SetSwizzledColor( -+ uint32_t index, -+ const float src[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. -+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ // Only loop over the components needed for destination. -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ this->color[i][offset[index]] = *(uint8_t*)&src[i]; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes lod offset for 1D surface at specified lod. -+/// @param baseWidth - width of basemip (mip 0). -+/// @param hAlign - horizontal alignment per mip, in texels -+/// @param lod - lod index -+/// @param offset - output offset. -+INLINE void ComputeLODOffset1D( -+ const SWR_FORMAT_INFO& info, -+ uint32_t baseWidth, -+ uint32_t hAlign, -+ uint32_t lod, -+ uint32_t &offset) -+{ -+ if (lod == 0) -+ { -+ offset = 0; -+ } -+ else -+ { -+ uint32_t curWidth = baseWidth; -+ // translate mip width from pixels to blocks for block compressed formats -+ // @note hAlign is already in blocks for compressed formats so no need to convert -+ if (info.isBC) curWidth /= info.bcWidth; -+ -+ offset = GFX_ALIGN(curWidth, hAlign); -+ for (uint32_t l = 1; l < lod; ++l) -+ { -+ curWidth = GFX_ALIGN(std::max(curWidth >> 1, 1U), hAlign); -+ offset += curWidth; -+ } -+ -+ if (info.isSubsampled) -+ { -+ offset /= info.bcWidth; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes x lod offset for 2D surface at specified lod. -+/// @param baseWidth - width of basemip (mip 0). -+/// @param hAlign - horizontal alignment per mip, in texels -+/// @param lod - lod index -+/// @param offset - output offset. -+INLINE void ComputeLODOffsetX( -+ const SWR_FORMAT_INFO& info, -+ uint32_t baseWidth, -+ uint32_t hAlign, -+ uint32_t lod, -+ uint32_t &offset) -+{ -+ if (lod < 2) -+ { -+ offset = 0; -+ } -+ else -+ { -+ uint32_t curWidth = baseWidth; -+ // convert mip width from pixels to blocks for block compressed formats -+ // @note hAlign is already in blocks for compressed formats so no need to convert -+ if (info.isBC) curWidth /= info.bcWidth; -+ -+ curWidth = std::max(curWidth >> 1, 1U); -+ curWidth = GFX_ALIGN(curWidth, hAlign); -+ -+ if (info.isSubsampled) -+ { -+ curWidth /= info.bcWidth; -+ } -+ -+ offset = curWidth; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes y lod offset for 2D surface at specified lod. -+/// @param baseHeight - height of basemip (mip 0). -+/// @param vAlign - vertical alignment per mip, in rows -+/// @param lod - lod index -+/// @param offset - output offset.
-+INLINE void ComputeLODOffsetY( -+ const SWR_FORMAT_INFO& info, -+ uint32_t baseHeight, -+ uint32_t vAlign, -+ uint32_t lod, -+ uint32_t &offset) -+{ -+ if (lod == 0) -+ { -+ offset = 0; -+ } -+ else -+ { -+ offset = 0; -+ uint32_t mipHeight = baseHeight; -+ -+ // translate mip height from pixels to blocks for block compressed formats -+ // @note VAlign is already in blocks for compressed formats so no need to convert -+ if (info.isBC) mipHeight /= info.bcHeight; -+ -+ for (uint32_t l = 1; l <= lod; ++l) -+ { -+ uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign); -+ offset += ((l != 2) ? alignedMipHeight : 0); -+ mipHeight = std::max(mipHeight >> 1, 1U); -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes 1D surface offset -+/// @param x - offset from start of array slice at given lod. -+/// @param array - array slice index -+/// @param lod - lod index -+/// @param pState - surface state -+/// @param xOffsetBytes - output offset in bytes. -+template -+INLINE void ComputeSurfaceOffset1D( -+ uint32_t x, -+ uint32_t array, -+ uint32_t lod, -+ const SWR_SURFACE_STATE *pState, -+ uint32_t &xOffsetBytes) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); -+ uint32_t lodOffset; -+ -+ if (UseCachedOffsets) -+ { -+ lodOffset = pState->lodOffsets[0][lod]; -+ } -+ else -+ { -+ ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset); -+ } -+ -+ xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Adjusts the array slice for legacy TileY MSAA -+/// @param pState - surface state -+/// @param arrayIndex - array slice index -+/// @param sampleNum - requested sample -+INLINE uint32_t AdjustArrayIndexForMSAA(const SWR_SURFACE_STATE *pState, uint32_t arrayIndex, uint32_t sampleNum) -+{ -+ uint32_t sampleSlice; -+ /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF. -+ if(pState->tileMode == SWR_TILE_MODE_YMAJOR || -+ pState->tileMode == SWR_TILE_NONE) -+ { -+ uint32_t sampleShift; -+ switch(pState->numSamples) -+ { -+ case 1: -+ assert(sampleNum == 0); -+ sampleShift = 0; -+ break; -+ case 2: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 1; -+ break; -+ case 4: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 2; -+ break; -+ case 8: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 3; -+ break; -+ case 16: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 4; -+ break; -+ default: -+ assert(0 && "Unsupported sample count"); -+ sampleShift = 0; -+ break; -+ } -+ sampleSlice = (arrayIndex << sampleShift) | sampleNum; -+ } -+ else -+ { -+ sampleSlice = arrayIndex; -+ } -+ return sampleSlice; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes 2D surface offset -+/// @param x - horizontal offset from start of array slice and lod. -+/// @param y - vertical offset from start of array slice and lod. -+/// @param array - array slice index -+/// @param sampleNum - sample index -+/// @param lod - lod index -+/// @param pState - surface state -+/// @param xOffsetBytes - output x offset in bytes. -+/// @param yOffsetRows - output y offset in rows.
-+template -+INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); -+ uint32_t lodOffsetX, lodOffsetY; -+ -+ if (UseCachedOffsets) -+ { -+ lodOffsetX = pState->lodOffsets[0][lod]; -+ lodOffsetY = pState->lodOffsets[1][lod]; -+ } -+ else -+ { -+ ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX); -+ ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY); -+ } -+ -+ uint32_t arrayIndex = AdjustArrayIndexForMSAA(pState, array, sampleNum); -+ xOffsetBytes = (x + lodOffsetX) * info.Bpp; -+ yOffsetRows = (arrayIndex * pState->qpitch) + lodOffsetY + y; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes 3D surface offset -+/// @param x - horizontal offset from start of array slice and lod. -+/// @param y - vertical offset from start of array slice and lod. -+/// @param z - depth offset from start of array slice and lod. -+/// @param lod - lod index -+/// @param pState - surface state -+/// @param xOffsetBytes - output x offset in bytes. -+/// @param yOffsetRows - output y offset in rows. -+/// @param zOffsetSlices - output z offset in slices. -+template -+INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); -+ uint32_t lodOffsetX, lodOffsetY; -+ -+ if (UseCachedOffsets) -+ { -+ lodOffsetX = pState->lodOffsets[0][lod]; -+ lodOffsetY = pState->lodOffsets[1][lod]; -+ } -+ else -+ { -+ ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX); -+ ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY); -+ } -+ -+ xOffsetBytes = (x + lodOffsetX) * info.Bpp; -+ yOffsetRows = lodOffsetY + y; -+ zOffsetSlices = z; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes -+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param pState - pointer to the surface state -+template -+INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) -+{ -+ return ComputeOffset2D(pState->pitch, xOffsetBytes, yOffsetRows); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes -+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param zOffsetSlices - z offset from base of surface in slices -+/// @param pState - pointer to the surface state -+template -+INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) -+{ -+ return ComputeOffset3D(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes
-+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param pState - pointer to the surface state -+INLINE -+uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) -+{ -+ switch (pState->tileMode) -+ { -+ case SWR_TILE_NONE: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_SWRZ: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ default: SWR_ASSERT(0, "Unsupported tiling mode"); -+ } -+ return (uint32_t) NULL; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes -+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param zOffsetSlices - z offset from base of surface in slices -+/// @param pState - pointer to the surface state -+INLINE -+uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) -+{ -+ switch (pState->tileMode) -+ { -+ case SWR_TILE_NONE: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); -+ case SWR_TILE_SWRZ: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); -+ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); -+ default: SWR_ASSERT(0, "Unsupported tiling mode"); -+ } -+ return (uint32_t) NULL; -+} -+ -+template -+INLINE -+uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) -+{ -+ uint32_t offsetX = 0, offsetY = 0, offsetZ = 0; -+ switch (pState->type) -+ { -+ case SURFACE_BUFFER: -+ case SURFACE_STRUCTURED_BUFFER: -+ offsetX = x * pState->pitch; -+ return offsetX; -+ break; -+ case SURFACE_1D: -+ ComputeSurfaceOffset1D(x, array, lod, pState, offsetX); -+ return TileSwizzle2D(offsetX, 0, pState); -+ break; -+ case SURFACE_2D: -+ ComputeSurfaceOffset2D(x, y, array, sampleNum, lod, pState, offsetX, offsetY); -+ return TileSwizzle2D(offsetX, offsetY, pState); -+ case SURFACE_3D: -+ ComputeSurfaceOffset3D(x, y, z, lod, pState, offsetX, offsetY, offsetZ); -+ return TileSwizzle3D(offsetX, offsetY, offsetZ, pState); -+ break; -+ case SURFACE_CUBE: -+ ComputeSurfaceOffset2D(x, y, array, sampleNum, lod, pState, offsetX, offsetY); -+ return TileSwizzle2D(offsetX, offsetY, pState); -+ break; -+ default: SWR_ASSERT(0, "Unsupported format"); -+ } -+ -+ return (uint32_t) NULL; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes surface address at the given location and lod -+/// @param x - x location in pixels -+/// @param y - y location in rows -+/// @param z - z location for 3D surfaces -+/// @param array - array slice for 1D and 2D surfaces -+/// @param lod - level of detail -+/// @param pState - pointer to the surface state -+template -+INLINE -+void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) -+{ -+ return pState->pBaseAddress 
+ ComputeSurfaceOffset(x, y, z, array, sampleNum, lod, pState); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h -new file mode 100644 -index 0000000..9dd4cd2 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h -@@ -0,0 +1,239 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tilingtraits.h -+* -+* @brief Tiling traits. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "core/state.h" -+ -+template -+struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ mode }; -+ static UINT GetCu() { SWR_ASSERT(0); return 0; } -+ static UINT GetCv() { SWR_ASSERT(0); return 0; } -+ static UINT GetCr() { SWR_ASSERT(0); return 0; } -+ static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; } -+ -+ /// @todo correct pdep shifts for all rastertile dims. Unused for now -+ static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } -+ static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } -+}; -+ -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE }; -+ static UINT GetCu() { return 0; } -+ static UINT GetCv() { return 0; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 0; } -+ static UINT GetPdepX() { return 0x00; } -+ static UINT GetPdepY() { return 0x00; } -+}; -+ -+template<> struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; -+ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; } -+ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; } -+ -+ /// @todo correct pdep shifts for all rastertile dims. 
Unused for now -+ static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; } -+ static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; } -+}; -+ -+template<> struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; -+ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; } -+ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; } -+ -+ static UINT GetPdepX() { return 0x37; } -+ static UINT GetPdepY() { return 0xC8; } -+}; -+ -+template<> struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; -+ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; } -+ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; } -+ -+ /// @todo correct pdep shifts for all rastertile dims. Unused for now -+ static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } -+ static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } -+}; -+ -+// y-major tiling layout unaffected by element size -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR }; -+ static UINT GetCu() { return 7; } -+ static UINT GetCv() { return 5; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 12; } -+ -+ static UINT GetPdepX() { return 0xe0f; } -+ static UINT GetPdepY() { return 0x1f0; } -+}; -+ -+// x-major tiling layout unaffected by element size -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR }; -+ static UINT GetCu() { return 9; } -+ static UINT GetCv() { return 3; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 12; } -+ -+ static UINT GetPdepX() { return 0x1ff; } -+ static UINT GetPdepY() { return 0xe00; } -+}; -+ -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR }; -+ static UINT GetCu() { return 6; } -+ static UINT GetCv() { return 6; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 12; } -+ -+ static UINT GetPdepX() { return 0xe15; } -+ static UINT GetPdepY() { return 0x1ea; } -+}; -+ -+INLINE -+UINT pdep_u32(UINT a, UINT mask) -+{ -+#if KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _pdep_u32(a, mask); -+#else -+ UINT result = 0; -+ -+ // copied from http://wm.ite.pl/articles/pdep-soft-emu.html -+ // using bsf instead of funky loop -+ DWORD maskIndex; -+ while (_BitScanForward(&maskIndex, mask)) -+ { -+ // 1. isolate lowest set bit of mask -+ const UINT lowest = 1 << maskIndex; -+ -+ // 2. populate LSB from src -+ const UINT LSB = (UINT)((int)(a << 31) >> 31); -+ -+ // 3. copy bit from mask -+ result |= LSB & lowest; -+ -+ // 4. clear lowest bit -+ mask &= ~lowest; -+ -+ // 5. 
-+ prepare for next iteration -+ a >>= 1; -+ } -+ -+ return result; -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the tileID for 2D tiled surfaces -+/// @param pitch - surface pitch in bytes -+/// @param tileX - x offset in tiles -+/// @param tileY - y offset in tiles -+template -+INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY) -+{ -+ UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX; -+ return tileID << TTraits::GetTileIDShift(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the tileID for 3D tiled surfaces -+/// @param qpitch - surface qpitch in rows -+/// @param pitch - surface pitch in bytes -+/// @param tileX - x offset in tiles -+/// @param tileY - y offset in tiles -+/// @param tileZ - z offset in tiles -+template -+INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ) -+{ -+ UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX; -+ return tileID << TTraits::GetTileIDShift(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the byte offset for 2D tiled surfaces -+/// @param pitch - surface pitch in bytes -+/// @param x - x offset in bytes -+/// @param y - y offset in rows -+template -+INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y) -+{ -+ UINT tileID = ComputeTileOffset2D(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv()); -+ UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX()); -+ UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY()); -+ return (tileID | xSwizzle | ySwizzle); -+} -+ -+#if KNOB_ARCH <= KNOB_ARCH_AVX -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the byte offset for 2D tiled surfaces. Specialization -+/// for tile-y surfaces that uses bit twiddling instead of pdep emulation. -+/// @param pitch - surface pitch in bytes -+/// @param x - x offset in bytes -+/// @param y - y offset in rows -+template<> -+INLINE UINT ComputeOffset2D >(UINT pitch, UINT x, UINT y) -+{ -+ typedef TilingTraits TTraits; -+ -+ UINT tileID = ComputeTileOffset2D(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv()); -+ UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf); -+ UINT ySwizzle = (y << 4) & 0x1f0; -+ return (tileID | xSwizzle | ySwizzle); -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the byte offset for 3D tiled surfaces -+/// @param qpitch - depth pitch in rows -+/// @param pitch - surface pitch in bytes -+/// @param x - x offset in bytes -+/// @param y - y offset in rows -+/// @param z - z offset in slices -+template -+INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z) -+{ -+ UINT tileID = ComputeTileOffset3D(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr()); -+ UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX()); -+ UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY()); -+ return (tileID | xSwizzle | ySwizzle); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py -new file mode 100644 -index 0000000..a6aa81b ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py -@@ -0,0 +1,79 @@ -+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+# Python source -+from __future__ import print_function -+import os -+import sys -+import knob_defs -+from mako.template import Template -+from mako.exceptions import RichTraceback -+ -+def write_template_to_string(template_filename, **kwargs): -+ try: -+ template = Template(filename=template_filename) -+ # Split + Join fixes line-endings for whatever platform you are using -+ return '\n'.join(template.render(**kwargs).splitlines()) -+ except: -+ traceback = RichTraceback() -+ for (filename, lineno, function, line) in traceback.traceback: -+ print("File %s, line %s, in %s" % (filename, lineno, function)) -+ print(line, "\n") -+ print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) -+ -+def write_template_to_file(template_filename, output_filename, **kwargs): -+ with open(output_filename, "w") as outfile: -+ print(write_template_to_string(template_filename, **kwargs), file=outfile) -+ -+def main(args=sys.argv[1:]): -+ if len(args) != 1: -+ print('Usage:', sys.argv[0], '', file=sys.stderr) -+ return 1 -+ -+ output_dir = args[0] -+ if not os.path.isdir(output_dir): -+ if os.path.exists(output_dir): -+ print('ERROR: Invalid output directory:', output_dir, file=sys.stderr) -+ return 1 -+ -+ try: -+ os.makedirs(output_dir) -+ except: -+ print('ERROR: Could not create output directory:', output_dir, file=sys.stderr) -+ return 1 -+ -+ # Output path exists, now just run the template -+ template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template']) -+ output_file = os.sep.join([output_dir, 'gen_knobs.cpp']) -+ output_header = os.sep.join([output_dir, 'gen_knobs.h']) -+ -+ for f in [output_header, output_file]: -+ write_template_to_file(template_file, f, -+ filename='gen_knobs', -+ knobs=knob_defs.KNOBS, -+ includes=['core/knobs_init.h'], -+ gen_header=True if f == output_header else False) -+ -+ return 0 -+ -+if __name__ == '__main__': -+ sys.exit(main()) -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py -new file mode 100644 -index 0000000..0a64953 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py -@@ -0,0 +1,212 @@ -+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+# Python source -+KNOBS = [ -+ ['ENABLE_ASSERT_DIALOGS', { -+ 'type' : 'bool', -+ 'default' : 'true', -+ 'desc' : ['Use dialogs when asserts fire.', -+ 'Asserts are only enabled in debug builds'], -+ }], -+ -+ ['USE_GENERIC_STORETILE', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Always use generic function for performing StoreTile.', -+ 'Will be slightly slower than using optimized (jitted) path'], -+ }], -+ -+ ['SINGLE_THREADED', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['If enabled will perform all rendering on the API thread.', -+ 'This is useful mainly for debugging purposes.'], -+ }], -+ -+ ['FAST_CLEAR', { -+ 'type' : 'bool', -+ 'default' : 'true', -+ 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', -+ 'defer clear execution to first backend op on hottile, or hottile store'], -+ }], -+ -+ ['MAX_NUMA_NODES', { -+ 'type' : 'uint32_t', -+ 'default' : '0', -+ 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', -+ ' 0 == ALL NUMA-nodes in the system', -+ ' N == Use at most N NUMA-nodes for rendering'], -+ }], -+ -+ ['MAX_CORES_PER_NUMA_NODE', { -+ 'type' : 'uint32_t', -+ 'default' : '0', -+ 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', -+ ' 0 == ALL non-API thread cores per NUMA-node', -+ ' N == Use at most N cores per NUMA-node'], -+ }], -+ -+ ['MAX_THREADS_PER_CORE', { -+ 'type' : 'uint32_t', -+ 'default' : '1', -+ 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', -+ ' 0 == ALL hyper-threads per core', -+ ' N == Use at most N hyper-threads per physical core'], -+ }], -+ -+ ['BUCKETS_START_FRAME', { -+ 'type' : 'uint32_t', -+ 'default' : '1200', -+ 'desc' : ['Frame from when to start saving buckets data.', -+ '', -+ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', -+ 'for this to have an effect.'], -+ }], -+ -+ ['BUCKETS_END_FRAME', { -+ 'type' : 'uint32_t', -+ 'default' : '1400', -+ 'desc' : ['Frame at which to stop saving buckets data.', -+ '', -+ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', -+ 'for this to have an effect.'], -+ }], -+ -+ ['TOSS_DRAW', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Disable per-draw execution', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_QUEUE_FE', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop 
per-draw execution at worker FE', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_FETCH', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at vertex fetch', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_IA', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at input assembler', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_VS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at vertex shader', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_SETUP_TRIS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at primitive setup', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_BIN_TRIS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at primitive binning', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_RS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at rasterizer', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['WORKER_SPIN_LOOP_COUNT', { -+ 'type' : 'uint32_t', -+ 'default' : '5000', -+ 'desc' : ['Number of spin-loop iterations worker threads will perform', -+ 'before going to sleep when waiting for work'], -+ }], -+ -+ ['MAX_DRAWS_IN_FLIGHT', { -+ 'type' : 'uint32_t', -+ 'default' : '160', -+ 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], -+ }], -+ -+ ['MAX_PRIMS_PER_DRAW', { -+ 'type' : 'uint32_t', -+ 'default' : '2040', -+ 'desc' : ['Maximum primitives in a single Draw().', -+ 'Larger primitives are split into smaller Draw calls.', -+ 'Should be a multiple of (3 * vectorWidth).'], -+ }], -+ -+ ['MAX_TESS_PRIMS_PER_DRAW', { -+ 'type' : 'uint32_t', -+ 'default' : '16', -+ 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', -+ 'Larger primitives are split into smaller Draw calls.', -+ 'Should be a multiple of (vectorWidth).'], -+ }], -+ -+ ['MAX_FRAC_ODD_TESS_FACTOR', { -+ 'type' : 'float', -+ 'default' : '63.0f', -+ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'], -+ }], -+ -+ ['MAX_FRAC_EVEN_TESS_FACTOR', { -+ 'type' : 'float', -+ 'default' : '64.0f', -+ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'], -+ }], -+ -+ ['MAX_INTEGER_TESS_FACTOR', { -+ 'type' : 'uint32_t', -+ 'default' : '64', -+ 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'], -+ }], -+ -+ ['DUMP_SHADER_IR', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], -+ }], -+ -+ -+] -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py -new file mode 100644 -index 0000000..d963848 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py -@@ -0,0 +1,8 @@ -+# mako/__init__.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+ -+__version__ = '1.0.1' -diff --git 
a/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py -new file mode 100644 -index 0000000..efbc4fc ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py -@@ -0,0 +1,845 @@ -+# mako/_ast_util.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+""" -+ ast -+ ~~~ -+ -+ The `ast` module helps Python applications to process trees of the Python -+ abstract syntax grammar. The abstract syntax itself might change with -+ each Python release; this module helps to find out programmatically what -+ the current grammar looks like and allows modifications of it. -+ -+ An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as -+ a flag to the `compile()` builtin function or by using the `parse()` -+ function from this module. The result will be a tree of objects whose -+ classes all inherit from `ast.AST`. -+ -+ A modified abstract syntax tree can be compiled into a Python code object -+ using the built-in `compile()` function. -+ -+ Additionally various helper functions are provided that make working with -+ the trees simpler. The main intention of the helper functions and this -+ module in general is to provide an easy to use interface for libraries -+ that work tightly with the python syntax (template engines for example). -+ -+ -+ :copyright: Copyright 2008 by Armin Ronacher. -+ :license: Python License. -+""" -+from _ast import * -+from mako.compat import arg_stringname -+ -+BOOLOP_SYMBOLS = { -+ And: 'and', -+ Or: 'or' -+} -+ -+BINOP_SYMBOLS = { -+ Add: '+', -+ Sub: '-', -+ Mult: '*', -+ Div: '/', -+ FloorDiv: '//', -+ Mod: '%', -+ LShift: '<<', -+ RShift: '>>', -+ BitOr: '|', -+ BitAnd: '&', -+ BitXor: '^' -+} -+ -+CMPOP_SYMBOLS = { -+ Eq: '==', -+ Gt: '>', -+ GtE: '>=', -+ In: 'in', -+ Is: 'is', -+ IsNot: 'is not', -+ Lt: '<', -+ LtE: '<=', -+ NotEq: '!=', -+ NotIn: 'not in' -+} -+ -+UNARYOP_SYMBOLS = { -+ Invert: '~', -+ Not: 'not', -+ UAdd: '+', -+ USub: '-' -+} -+ -+ALL_SYMBOLS = {} -+ALL_SYMBOLS.update(BOOLOP_SYMBOLS) -+ALL_SYMBOLS.update(BINOP_SYMBOLS) -+ALL_SYMBOLS.update(CMPOP_SYMBOLS) -+ALL_SYMBOLS.update(UNARYOP_SYMBOLS) -+ -+ -+def parse(expr, filename='', mode='exec'): -+ """Parse an expression into an AST node.""" -+ return compile(expr, filename, mode, PyCF_ONLY_AST) -+ -+ -+def to_source(node, indent_with=' ' * 4): -+ """ -+ This function can convert a node tree back into python sourcecode. This -+ is useful for debugging purposes, especially if you're dealing with custom -+ asts not generated by python itself. -+ -+ It could be that the sourcecode is evaluable when the AST itself is not -+ compilable / evaluable. The reason for this is that the AST contains some -+ more data than regular sourcecode does, which is dropped during -+ conversion. -+ -+ Each level of indentation is replaced with `indent_with`. Per default this -+ parameter is equal to four spaces as suggested by PEP 8, but it might be -+ adjusted to match the application's styleguide. -+ """ -+ generator = SourceGenerator(indent_with) -+ generator.visit(node) -+ return ''.join(generator.result) -+ -+ -+def dump(node): -+ """ -+ A very verbose representation of the node passed. This is useful for -+ debugging purposes. 
-+ """ -+ def _format(node): -+ if isinstance(node, AST): -+ return '%s(%s)' % (node.__class__.__name__, -+ ', '.join('%s=%s' % (a, _format(b)) -+ for a, b in iter_fields(node))) -+ elif isinstance(node, list): -+ return '[%s]' % ', '.join(_format(x) for x in node) -+ return repr(node) -+ if not isinstance(node, AST): -+ raise TypeError('expected AST, got %r' % node.__class__.__name__) -+ return _format(node) -+ -+ -+def copy_location(new_node, old_node): -+ """ -+ Copy the source location hint (`lineno` and `col_offset`) from the -+ old to the new node if possible and return the new one. -+ """ -+ for attr in 'lineno', 'col_offset': -+ if attr in old_node._attributes and attr in new_node._attributes \ -+ and hasattr(old_node, attr): -+ setattr(new_node, attr, getattr(old_node, attr)) -+ return new_node -+ -+ -+def fix_missing_locations(node): -+ """ -+ Some nodes require a line number and the column offset. Without that -+ information the compiler will abort the compilation. Because it can be -+ a dull task to add appropriate line numbers and column offsets when -+ adding new nodes this function can help. It copies the line number and -+ column offset of the parent node to the child nodes without this -+ information. -+ -+ Unlike `copy_location` this works recursive and won't touch nodes that -+ already have a location information. -+ """ -+ def _fix(node, lineno, col_offset): -+ if 'lineno' in node._attributes: -+ if not hasattr(node, 'lineno'): -+ node.lineno = lineno -+ else: -+ lineno = node.lineno -+ if 'col_offset' in node._attributes: -+ if not hasattr(node, 'col_offset'): -+ node.col_offset = col_offset -+ else: -+ col_offset = node.col_offset -+ for child in iter_child_nodes(node): -+ _fix(child, lineno, col_offset) -+ _fix(node, 1, 0) -+ return node -+ -+ -+def increment_lineno(node, n=1): -+ """ -+ Increment the line numbers of all nodes by `n` if they have line number -+ attributes. This is useful to "move code" to a different location in a -+ file. -+ """ -+ for node in zip((node,), walk(node)): -+ if 'lineno' in node._attributes: -+ node.lineno = getattr(node, 'lineno', 0) + n -+ -+ -+def iter_fields(node): -+ """Iterate over all fields of a node, only yielding existing fields.""" -+ # CPython 2.5 compat -+ if not hasattr(node, '_fields') or not node._fields: -+ return -+ for field in node._fields: -+ try: -+ yield field, getattr(node, field) -+ except AttributeError: -+ pass -+ -+ -+def get_fields(node): -+ """Like `iter_fiels` but returns a dict.""" -+ return dict(iter_fields(node)) -+ -+ -+def iter_child_nodes(node): -+ """Iterate over all child nodes or a node.""" -+ for name, field in iter_fields(node): -+ if isinstance(field, AST): -+ yield field -+ elif isinstance(field, list): -+ for item in field: -+ if isinstance(item, AST): -+ yield item -+ -+ -+def get_child_nodes(node): -+ """Like `iter_child_nodes` but returns a list.""" -+ return list(iter_child_nodes(node)) -+ -+ -+def get_compile_mode(node): -+ """ -+ Get the mode for `compile` of a given node. If the node is not a `mod` -+ node (`Expression`, `Module` etc.) a `TypeError` is thrown. -+ """ -+ if not isinstance(node, mod): -+ raise TypeError('expected mod node, got %r' % node.__class__.__name__) -+ return { -+ Expression: 'eval', -+ Interactive: 'single' -+ }.get(node.__class__, 'expr') -+ -+ -+def get_docstring(node): -+ """ -+ Return the docstring for the given node or `None` if no docstring can be -+ found. If the node provided does not accept docstrings a `TypeError` -+ will be raised. 
-+ """ -+ if not isinstance(node, (FunctionDef, ClassDef, Module)): -+ raise TypeError("%r can't have docstrings" % node.__class__.__name__) -+ if node.body and isinstance(node.body[0], Str): -+ return node.body[0].s -+ -+ -+def walk(node): -+ """ -+ Iterate over all nodes. This is useful if you only want to modify nodes in -+ place and don't care about the context or the order the nodes are returned. -+ """ -+ from collections import deque -+ todo = deque([node]) -+ while todo: -+ node = todo.popleft() -+ todo.extend(iter_child_nodes(node)) -+ yield node -+ -+ -+class NodeVisitor(object): -+ """ -+ Walks the abstract syntax tree and call visitor functions for every node -+ found. The visitor functions may return values which will be forwarded -+ by the `visit` method. -+ -+ Per default the visitor functions for the nodes are ``'visit_'`` + -+ class name of the node. So a `TryFinally` node visit function would -+ be `visit_TryFinally`. This behavior can be changed by overriding -+ the `get_visitor` function. If no visitor function exists for a node -+ (return value `None`) the `generic_visit` visitor is used instead. -+ -+ Don't use the `NodeVisitor` if you want to apply changes to nodes during -+ traversing. For this a special visitor exists (`NodeTransformer`) that -+ allows modifications. -+ """ -+ -+ def get_visitor(self, node): -+ """ -+ Return the visitor function for this node or `None` if no visitor -+ exists for this node. In that case the generic visit function is -+ used instead. -+ """ -+ method = 'visit_' + node.__class__.__name__ -+ return getattr(self, method, None) -+ -+ def visit(self, node): -+ """Visit a node.""" -+ f = self.get_visitor(node) -+ if f is not None: -+ return f(node) -+ return self.generic_visit(node) -+ -+ def generic_visit(self, node): -+ """Called if no explicit visitor function exists for a node.""" -+ for field, value in iter_fields(node): -+ if isinstance(value, list): -+ for item in value: -+ if isinstance(item, AST): -+ self.visit(item) -+ elif isinstance(value, AST): -+ self.visit(value) -+ -+ -+class NodeTransformer(NodeVisitor): -+ """ -+ Walks the abstract syntax tree and allows modifications of nodes. -+ -+ The `NodeTransformer` will walk the AST and use the return value of the -+ visitor functions to replace or remove the old node. If the return -+ value of the visitor function is `None` the node will be removed -+ from the previous location otherwise it's replaced with the return -+ value. The return value may be the original node in which case no -+ replacement takes place. -+ -+ Here an example transformer that rewrites all `foo` to `data['foo']`:: -+ -+ class RewriteName(NodeTransformer): -+ -+ def visit_Name(self, node): -+ return copy_location(Subscript( -+ value=Name(id='data', ctx=Load()), -+ slice=Index(value=Str(s=node.id)), -+ ctx=node.ctx -+ ), node) -+ -+ Keep in mind that if the node you're operating on has child nodes -+ you must either transform the child nodes yourself or call the generic -+ visit function for the node first. -+ -+ Nodes that were part of a collection of statements (that applies to -+ all statement nodes) may also return a list of nodes rather than just -+ a single node. 
-+ -+ Usually you use the transformer like this:: -+ -+ node = YourTransformer().visit(node) -+ """ -+ -+ def generic_visit(self, node): -+ for field, old_value in iter_fields(node): -+ old_value = getattr(node, field, None) -+ if isinstance(old_value, list): -+ new_values = [] -+ for value in old_value: -+ if isinstance(value, AST): -+ value = self.visit(value) -+ if value is None: -+ continue -+ elif not isinstance(value, AST): -+ new_values.extend(value) -+ continue -+ new_values.append(value) -+ old_value[:] = new_values -+ elif isinstance(old_value, AST): -+ new_node = self.visit(old_value) -+ if new_node is None: -+ delattr(node, field) -+ else: -+ setattr(node, field, new_node) -+ return node -+ -+ -+class SourceGenerator(NodeVisitor): -+ """ -+ This visitor is able to transform a well formed syntax tree into python -+ sourcecode. For more details have a look at the docstring of the -+ `node_to_source` function. -+ """ -+ -+ def __init__(self, indent_with): -+ self.result = [] -+ self.indent_with = indent_with -+ self.indentation = 0 -+ self.new_lines = 0 -+ -+ def write(self, x): -+ if self.new_lines: -+ if self.result: -+ self.result.append('\n' * self.new_lines) -+ self.result.append(self.indent_with * self.indentation) -+ self.new_lines = 0 -+ self.result.append(x) -+ -+ def newline(self, n=1): -+ self.new_lines = max(self.new_lines, n) -+ -+ def body(self, statements): -+ self.new_line = True -+ self.indentation += 1 -+ for stmt in statements: -+ self.visit(stmt) -+ self.indentation -= 1 -+ -+ def body_or_else(self, node): -+ self.body(node.body) -+ if node.orelse: -+ self.newline() -+ self.write('else:') -+ self.body(node.orelse) -+ -+ def signature(self, node): -+ want_comma = [] -+ def write_comma(): -+ if want_comma: -+ self.write(', ') -+ else: -+ want_comma.append(True) -+ -+ padding = [None] * (len(node.args) - len(node.defaults)) -+ for arg, default in zip(node.args, padding + node.defaults): -+ write_comma() -+ self.visit(arg) -+ if default is not None: -+ self.write('=') -+ self.visit(default) -+ if node.vararg is not None: -+ write_comma() -+ self.write('*' + arg_stringname(node.vararg)) -+ if node.kwarg is not None: -+ write_comma() -+ self.write('**' + arg_stringname(node.kwarg)) -+ -+ def decorators(self, node): -+ for decorator in node.decorator_list: -+ self.newline() -+ self.write('@') -+ self.visit(decorator) -+ -+ # Statements -+ -+ def visit_Assign(self, node): -+ self.newline() -+ for idx, target in enumerate(node.targets): -+ if idx: -+ self.write(', ') -+ self.visit(target) -+ self.write(' = ') -+ self.visit(node.value) -+ -+ def visit_AugAssign(self, node): -+ self.newline() -+ self.visit(node.target) -+ self.write(BINOP_SYMBOLS[type(node.op)] + '=') -+ self.visit(node.value) -+ -+ def visit_ImportFrom(self, node): -+ self.newline() -+ self.write('from %s%s import ' % ('.' 
* node.level, node.module)) -+ for idx, item in enumerate(node.names): -+ if idx: -+ self.write(', ') -+ self.write(item) -+ -+ def visit_Import(self, node): -+ self.newline() -+ for item in node.names: -+ self.write('import ') -+ self.visit(item) -+ -+ def visit_Expr(self, node): -+ self.newline() -+ self.generic_visit(node) -+ -+ def visit_FunctionDef(self, node): -+ self.newline(n=2) -+ self.decorators(node) -+ self.newline() -+ self.write('def %s(' % node.name) -+ self.signature(node.args) -+ self.write('):') -+ self.body(node.body) -+ -+ def visit_ClassDef(self, node): -+ have_args = [] -+ def paren_or_comma(): -+ if have_args: -+ self.write(', ') -+ else: -+ have_args.append(True) -+ self.write('(') -+ -+ self.newline(n=3) -+ self.decorators(node) -+ self.newline() -+ self.write('class %s' % node.name) -+ for base in node.bases: -+ paren_or_comma() -+ self.visit(base) -+ # XXX: the if here is used to keep this module compatible -+ # with python 2.6. -+ if hasattr(node, 'keywords'): -+ for keyword in node.keywords: -+ paren_or_comma() -+ self.write(keyword.arg + '=') -+ self.visit(keyword.value) -+ if node.starargs is not None: -+ paren_or_comma() -+ self.write('*') -+ self.visit(node.starargs) -+ if node.kwargs is not None: -+ paren_or_comma() -+ self.write('**') -+ self.visit(node.kwargs) -+ self.write(have_args and '):' or ':') -+ self.body(node.body) -+ -+ def visit_If(self, node): -+ self.newline() -+ self.write('if ') -+ self.visit(node.test) -+ self.write(':') -+ self.body(node.body) -+ while True: -+ else_ = node.orelse -+ if len(else_) == 1 and isinstance(else_[0], If): -+ node = else_[0] -+ self.newline() -+ self.write('elif ') -+ self.visit(node.test) -+ self.write(':') -+ self.body(node.body) -+ else: -+ self.newline() -+ self.write('else:') -+ self.body(else_) -+ break -+ -+ def visit_For(self, node): -+ self.newline() -+ self.write('for ') -+ self.visit(node.target) -+ self.write(' in ') -+ self.visit(node.iter) -+ self.write(':') -+ self.body_or_else(node) -+ -+ def visit_While(self, node): -+ self.newline() -+ self.write('while ') -+ self.visit(node.test) -+ self.write(':') -+ self.body_or_else(node) -+ -+ def visit_With(self, node): -+ self.newline() -+ self.write('with ') -+ self.visit(node.context_expr) -+ if node.optional_vars is not None: -+ self.write(' as ') -+ self.visit(node.optional_vars) -+ self.write(':') -+ self.body(node.body) -+ -+ def visit_Pass(self, node): -+ self.newline() -+ self.write('pass') -+ -+ def visit_Print(self, node): -+ # XXX: python 2.6 only -+ self.newline() -+ self.write('print ') -+ want_comma = False -+ if node.dest is not None: -+ self.write(' >> ') -+ self.visit(node.dest) -+ want_comma = True -+ for value in node.values: -+ if want_comma: -+ self.write(', ') -+ self.visit(value) -+ want_comma = True -+ if not node.nl: -+ self.write(',') -+ -+ def visit_Delete(self, node): -+ self.newline() -+ self.write('del ') -+ for idx, target in enumerate(node): -+ if idx: -+ self.write(', ') -+ self.visit(target) -+ -+ def visit_TryExcept(self, node): -+ self.newline() -+ self.write('try:') -+ self.body(node.body) -+ for handler in node.handlers: -+ self.visit(handler) -+ -+ def visit_TryFinally(self, node): -+ self.newline() -+ self.write('try:') -+ self.body(node.body) -+ self.newline() -+ self.write('finally:') -+ self.body(node.finalbody) -+ -+ def visit_Global(self, node): -+ self.newline() -+ self.write('global ' + ', '.join(node.names)) -+ -+ def visit_Nonlocal(self, node): -+ self.newline() -+ self.write('nonlocal ' + ', 
'.join(node.names)) -+ -+ def visit_Return(self, node): -+ self.newline() -+ self.write('return ') -+ self.visit(node.value) -+ -+ def visit_Break(self, node): -+ self.newline() -+ self.write('break') -+ -+ def visit_Continue(self, node): -+ self.newline() -+ self.write('continue') -+ -+ def visit_Raise(self, node): -+ # XXX: Python 2.6 / 3.0 compatibility -+ self.newline() -+ self.write('raise') -+ if hasattr(node, 'exc') and node.exc is not None: -+ self.write(' ') -+ self.visit(node.exc) -+ if node.cause is not None: -+ self.write(' from ') -+ self.visit(node.cause) -+ elif hasattr(node, 'type') and node.type is not None: -+ self.visit(node.type) -+ if node.inst is not None: -+ self.write(', ') -+ self.visit(node.inst) -+ if node.tback is not None: -+ self.write(', ') -+ self.visit(node.tback) -+ -+ # Expressions -+ -+ def visit_Attribute(self, node): -+ self.visit(node.value) -+ self.write('.' + node.attr) -+ -+ def visit_Call(self, node): -+ want_comma = [] -+ def write_comma(): -+ if want_comma: -+ self.write(', ') -+ else: -+ want_comma.append(True) -+ -+ self.visit(node.func) -+ self.write('(') -+ for arg in node.args: -+ write_comma() -+ self.visit(arg) -+ for keyword in node.keywords: -+ write_comma() -+ self.write(keyword.arg + '=') -+ self.visit(keyword.value) -+ if node.starargs is not None: -+ write_comma() -+ self.write('*') -+ self.visit(node.starargs) -+ if node.kwargs is not None: -+ write_comma() -+ self.write('**') -+ self.visit(node.kwargs) -+ self.write(')') -+ -+ def visit_Name(self, node): -+ self.write(node.id) -+ -+ def visit_NameConstant(self, node): -+ self.write(str(node.value)) -+ -+ def visit_arg(self, node): -+ self.write(node.arg) -+ -+ def visit_Str(self, node): -+ self.write(repr(node.s)) -+ -+ def visit_Bytes(self, node): -+ self.write(repr(node.s)) -+ -+ def visit_Num(self, node): -+ self.write(repr(node.n)) -+ -+ def visit_Tuple(self, node): -+ self.write('(') -+ idx = -1 -+ for idx, item in enumerate(node.elts): -+ if idx: -+ self.write(', ') -+ self.visit(item) -+ self.write(idx and ')' or ',)') -+ -+ def sequence_visit(left, right): -+ def visit(self, node): -+ self.write(left) -+ for idx, item in enumerate(node.elts): -+ if idx: -+ self.write(', ') -+ self.visit(item) -+ self.write(right) -+ return visit -+ -+ visit_List = sequence_visit('[', ']') -+ visit_Set = sequence_visit('{', '}') -+ del sequence_visit -+ -+ def visit_Dict(self, node): -+ self.write('{') -+ for idx, (key, value) in enumerate(zip(node.keys, node.values)): -+ if idx: -+ self.write(', ') -+ self.visit(key) -+ self.write(': ') -+ self.visit(value) -+ self.write('}') -+ -+ def visit_BinOp(self, node): -+ self.write('(') -+ self.visit(node.left) -+ self.write(' %s ' % BINOP_SYMBOLS[type(node.op)]) -+ self.visit(node.right) -+ self.write(')') -+ -+ def visit_BoolOp(self, node): -+ self.write('(') -+ for idx, value in enumerate(node.values): -+ if idx: -+ self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)]) -+ self.visit(value) -+ self.write(')') -+ -+ def visit_Compare(self, node): -+ self.write('(') -+ self.visit(node.left) -+ for op, right in zip(node.ops, node.comparators): -+ self.write(' %s ' % CMPOP_SYMBOLS[type(op)]) -+ self.visit(right) -+ self.write(')') -+ -+ def visit_UnaryOp(self, node): -+ self.write('(') -+ op = UNARYOP_SYMBOLS[type(node.op)] -+ self.write(op) -+ if op == 'not': -+ self.write(' ') -+ self.visit(node.operand) -+ self.write(')') -+ -+ def visit_Subscript(self, node): -+ self.visit(node.value) -+ self.write('[') -+ self.visit(node.slice) -+ self.write(']') 
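A usage sketch (illustrative only, not taken from this patch): the parse() and to_source() helpers defined earlier in this vendored _ast_util.py can round-trip a small statement through an AST and back out through the SourceGenerator visitors above. The sys.path entry below assumes the example is run from the top of the source tree, since the vendored mako package lives under src/gallium/drivers/swr/rasterizer/scripts/ in this patch.

    import sys
    # Make the vendored mako package importable (path as added by this patch).
    sys.path.insert(0, "src/gallium/drivers/swr/rasterizer/scripts")
    from mako._ast_util import parse, to_source

    tree = parse("flag = not done")   # parse() wraps compile(..., PyCF_ONLY_AST)
    print(to_source(tree))            # prints: flag = (not done)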
-+ -+ def visit_Slice(self, node): -+ if node.lower is not None: -+ self.visit(node.lower) -+ self.write(':') -+ if node.upper is not None: -+ self.visit(node.upper) -+ if node.step is not None: -+ self.write(':') -+ if not (isinstance(node.step, Name) and node.step.id == 'None'): -+ self.visit(node.step) -+ -+ def visit_ExtSlice(self, node): -+ for idx, item in node.dims: -+ if idx: -+ self.write(', ') -+ self.visit(item) -+ -+ def visit_Yield(self, node): -+ self.write('yield ') -+ self.visit(node.value) -+ -+ def visit_Lambda(self, node): -+ self.write('lambda ') -+ self.signature(node.args) -+ self.write(': ') -+ self.visit(node.body) -+ -+ def visit_Ellipsis(self, node): -+ self.write('Ellipsis') -+ -+ def generator_visit(left, right): -+ def visit(self, node): -+ self.write(left) -+ self.visit(node.elt) -+ for comprehension in node.generators: -+ self.visit(comprehension) -+ self.write(right) -+ return visit -+ -+ visit_ListComp = generator_visit('[', ']') -+ visit_GeneratorExp = generator_visit('(', ')') -+ visit_SetComp = generator_visit('{', '}') -+ del generator_visit -+ -+ def visit_DictComp(self, node): -+ self.write('{') -+ self.visit(node.key) -+ self.write(': ') -+ self.visit(node.value) -+ for comprehension in node.generators: -+ self.visit(comprehension) -+ self.write('}') -+ -+ def visit_IfExp(self, node): -+ self.visit(node.body) -+ self.write(' if ') -+ self.visit(node.test) -+ self.write(' else ') -+ self.visit(node.orelse) -+ -+ def visit_Starred(self, node): -+ self.write('*') -+ self.visit(node.value) -+ -+ def visit_Repr(self, node): -+ # XXX: python 2.6 only -+ self.write('`') -+ self.visit(node.value) -+ self.write('`') -+ -+ # Helper Nodes -+ -+ def visit_alias(self, node): -+ self.write(node.name) -+ if node.asname is not None: -+ self.write(' as ' + node.asname) -+ -+ def visit_comprehension(self, node): -+ self.write(' for ') -+ self.visit(node.target) -+ self.write(' in ') -+ self.visit(node.iter) -+ if node.ifs: -+ for if_ in node.ifs: -+ self.write(' if ') -+ self.visit(if_) -+ -+ def visit_excepthandler(self, node): -+ self.newline() -+ self.write('except') -+ if node.type is not None: -+ self.write(' ') -+ self.visit(node.type) -+ if node.name is not None: -+ self.write(' as ') -+ self.visit(node.name) -+ self.write(':') -+ self.body(node.body) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py -new file mode 100644 -index 0000000..65fd84d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py -@@ -0,0 +1,178 @@ -+# mako/ast.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""utilities for analyzing expressions and blocks of Python -+code, as well as generating Python from AST nodes""" -+ -+from mako import exceptions, pyparser, compat -+import re -+ -+class PythonCode(object): -+ """represents information about a string containing Python code""" -+ def __init__(self, code, **exception_kwargs): -+ self.code = code -+ -+ # represents all identifiers which are assigned to at some point in -+ # the code -+ self.declared_identifiers = set() -+ -+ # represents all identifiers which are referenced before their -+ # assignment, if any -+ self.undeclared_identifiers = set() -+ -+ # note that an identifier can be in both the undeclared and declared -+ # lists. 
-+ -+ # using AST to parse instead of using code.co_varnames, -+ # code.co_names has several advantages: -+ # - we can locate an identifier as "undeclared" even if -+ # its declared later in the same block of code -+ # - AST is less likely to break with version changes -+ # (for example, the behavior of co_names changed a little bit -+ # in python version 2.5) -+ if isinstance(code, compat.string_types): -+ expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs) -+ else: -+ expr = code -+ -+ f = pyparser.FindIdentifiers(self, **exception_kwargs) -+ f.visit(expr) -+ -+class ArgumentList(object): -+ """parses a fragment of code as a comma-separated list of expressions""" -+ def __init__(self, code, **exception_kwargs): -+ self.codeargs = [] -+ self.args = [] -+ self.declared_identifiers = set() -+ self.undeclared_identifiers = set() -+ if isinstance(code, compat.string_types): -+ if re.match(r"\S", code) and not re.match(r",\s*$", code): -+ # if theres text and no trailing comma, insure its parsed -+ # as a tuple by adding a trailing comma -+ code += "," -+ expr = pyparser.parse(code, "exec", **exception_kwargs) -+ else: -+ expr = code -+ -+ f = pyparser.FindTuple(self, PythonCode, **exception_kwargs) -+ f.visit(expr) -+ -+class PythonFragment(PythonCode): -+ """extends PythonCode to provide identifier lookups in partial control -+ statements -+ -+ e.g. -+ for x in 5: -+ elif y==9: -+ except (MyException, e): -+ etc. -+ """ -+ def __init__(self, code, **exception_kwargs): -+ m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S) -+ if not m: -+ raise exceptions.CompileException( -+ "Fragment '%s' is not a partial control statement" % -+ code, **exception_kwargs) -+ if m.group(3): -+ code = code[:m.start(3)] -+ (keyword, expr) = m.group(1,2) -+ if keyword in ['for','if', 'while']: -+ code = code + "pass" -+ elif keyword == 'try': -+ code = code + "pass\nexcept:pass" -+ elif keyword == 'elif' or keyword == 'else': -+ code = "if False:pass\n" + code + "pass" -+ elif keyword == 'except': -+ code = "try:pass\n" + code + "pass" -+ elif keyword == 'with': -+ code = code + "pass" -+ else: -+ raise exceptions.CompileException( -+ "Unsupported control keyword: '%s'" % -+ keyword, **exception_kwargs) -+ super(PythonFragment, self).__init__(code, **exception_kwargs) -+ -+ -+class FunctionDecl(object): -+ """function declaration""" -+ def __init__(self, code, allow_kwargs=True, **exception_kwargs): -+ self.code = code -+ expr = pyparser.parse(code, "exec", **exception_kwargs) -+ -+ f = pyparser.ParseFunc(self, **exception_kwargs) -+ f.visit(expr) -+ if not hasattr(self, 'funcname'): -+ raise exceptions.CompileException( -+ "Code '%s' is not a function declaration" % code, -+ **exception_kwargs) -+ if not allow_kwargs and self.kwargs: -+ raise exceptions.CompileException( -+ "'**%s' keyword argument not allowed here" % -+ self.kwargnames[-1], **exception_kwargs) -+ -+ def get_argument_expressions(self, as_call=False): -+ """Return the argument declarations of this FunctionDecl as a printable -+ list. -+ -+ By default the return value is appropriate for writing in a ``def``; -+ set `as_call` to true to build arguments to be passed to the function -+ instead (assuming locals with the same names as the arguments exist). 
-+ """ -+ -+ namedecls = [] -+ -+ # Build in reverse order, since defaults and slurpy args come last -+ argnames = self.argnames[::-1] -+ kwargnames = self.kwargnames[::-1] -+ defaults = self.defaults[::-1] -+ kwdefaults = self.kwdefaults[::-1] -+ -+ # Named arguments -+ if self.kwargs: -+ namedecls.append("**" + kwargnames.pop(0)) -+ -+ for name in kwargnames: -+ # Keyword-only arguments must always be used by name, so even if -+ # this is a call, print out `foo=foo` -+ if as_call: -+ namedecls.append("%s=%s" % (name, name)) -+ elif kwdefaults: -+ default = kwdefaults.pop(0) -+ if default is None: -+ # The AST always gives kwargs a default, since you can do -+ # `def foo(*, a=1, b, c=3)` -+ namedecls.append(name) -+ else: -+ namedecls.append("%s=%s" % ( -+ name, pyparser.ExpressionGenerator(default).value())) -+ else: -+ namedecls.append(name) -+ -+ # Positional arguments -+ if self.varargs: -+ namedecls.append("*" + argnames.pop(0)) -+ -+ for name in argnames: -+ if as_call or not defaults: -+ namedecls.append(name) -+ else: -+ default = defaults.pop(0) -+ namedecls.append("%s=%s" % ( -+ name, pyparser.ExpressionGenerator(default).value())) -+ -+ namedecls.reverse() -+ return namedecls -+ -+ @property -+ def allargnames(self): -+ return tuple(self.argnames) + tuple(self.kwargnames) -+ -+class FunctionArgs(FunctionDecl): -+ """the argument portion of a function declaration""" -+ -+ def __init__(self, code, **kwargs): -+ super(FunctionArgs, self).__init__("def ANON(%s):pass" % code, -+ **kwargs) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py -new file mode 100644 -index 0000000..c405c51 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py -@@ -0,0 +1,238 @@ -+# mako/cache.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+from mako import compat, util -+ -+_cache_plugins = util.PluginLoader("mako.cache") -+ -+register_plugin = _cache_plugins.register -+register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl") -+ -+ -+class Cache(object): -+ """Represents a data content cache made available to the module -+ space of a specific :class:`.Template` object. -+ -+ .. versionadded:: 0.6 -+ :class:`.Cache` by itself is mostly a -+ container for a :class:`.CacheImpl` object, which implements -+ a fixed API to provide caching services; specific subclasses exist to -+ implement different -+ caching strategies. Mako includes a backend that works with -+ the Beaker caching system. Beaker itself then supports -+ a number of backends (i.e. file, memory, memcached, etc.) -+ -+ The construction of a :class:`.Cache` is part of the mechanics -+ of a :class:`.Template`, and programmatic access to this -+ cache is typically via the :attr:`.Template.cache` attribute. -+ -+ """ -+ -+ impl = None -+ """Provide the :class:`.CacheImpl` in use by this :class:`.Cache`. -+ -+ This accessor allows a :class:`.CacheImpl` with additional -+ methods beyond that of :class:`.Cache` to be used programmatically. -+ -+ """ -+ -+ id = None -+ """Return the 'id' that identifies this cache. -+ -+ This is a value that should be globally unique to the -+ :class:`.Template` associated with this cache, and can -+ be used by a caching system to name a local container -+ for data specific to this template. 
-+ -+ """ -+ -+ starttime = None -+ """Epochal time value for when the owning :class:`.Template` was -+ first compiled. -+ -+ A cache implementation may wish to invalidate data earlier than -+ this timestamp; this has the effect of the cache for a specific -+ :class:`.Template` starting clean any time the :class:`.Template` -+ is recompiled, such as when the original template file changed on -+ the filesystem. -+ -+ """ -+ -+ def __init__(self, template, *args): -+ # check for a stale template calling the -+ # constructor -+ if isinstance(template, compat.string_types) and args: -+ return -+ self.template = template -+ self.id = template.module.__name__ -+ self.starttime = template.module._modified_time -+ self._def_regions = {} -+ self.impl = self._load_impl(self.template.cache_impl) -+ -+ def _load_impl(self, name): -+ return _cache_plugins.load(name)(self) -+ -+ def get_or_create(self, key, creation_function, **kw): -+ """Retrieve a value from the cache, using the given creation function -+ to generate a new value.""" -+ -+ return self._ctx_get_or_create(key, creation_function, None, **kw) -+ -+ def _ctx_get_or_create(self, key, creation_function, context, **kw): -+ """Retrieve a value from the cache, using the given creation function -+ to generate a new value.""" -+ -+ if not self.template.cache_enabled: -+ return creation_function() -+ -+ return self.impl.get_or_create( -+ key, -+ creation_function, -+ **self._get_cache_kw(kw, context)) -+ -+ def set(self, key, value, **kw): -+ """Place a value in the cache. -+ -+ :param key: the value's key. -+ :param value: the value. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ -+ self.impl.set(key, value, **self._get_cache_kw(kw, None)) -+ -+ put = set -+ """A synonym for :meth:`.Cache.set`. -+ -+ This is here for backwards compatibility. -+ -+ """ -+ -+ def get(self, key, **kw): -+ """Retrieve a value from the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. The -+ backend is configured using these arguments upon first request. -+ Subsequent requests that use the same series of configuration -+ values will use that same backend. -+ -+ """ -+ return self.impl.get(key, **self._get_cache_kw(kw, None)) -+ -+ def invalidate(self, key, **kw): -+ """Invalidate a value in the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. The -+ backend is configured using these arguments upon first request. -+ Subsequent requests that use the same series of configuration -+ values will use that same backend. -+ -+ """ -+ self.impl.invalidate(key, **self._get_cache_kw(kw, None)) -+ -+ def invalidate_body(self): -+ """Invalidate the cached content of the "body" method for this -+ template. -+ -+ """ -+ self.invalidate('render_body', __M_defname='render_body') -+ -+ def invalidate_def(self, name): -+ """Invalidate the cached content of a particular ``<%def>`` within this -+ template. -+ -+ """ -+ -+ self.invalidate('render_%s' % name, __M_defname='render_%s' % name) -+ -+ def invalidate_closure(self, name): -+ """Invalidate a nested ``<%def>`` within this template. -+ -+ Caching of nested defs is a blunt tool as there is no -+ management of scope -- nested defs that use cache tags -+ need to have names unique of all other nested defs in the -+ template, else their content will be overwritten by -+ each other. 
-+ -+ """ -+ -+ self.invalidate(name, __M_defname=name) -+ -+ def _get_cache_kw(self, kw, context): -+ defname = kw.pop('__M_defname', None) -+ if not defname: -+ tmpl_kw = self.template.cache_args.copy() -+ tmpl_kw.update(kw) -+ elif defname in self._def_regions: -+ tmpl_kw = self._def_regions[defname] -+ else: -+ tmpl_kw = self.template.cache_args.copy() -+ tmpl_kw.update(kw) -+ self._def_regions[defname] = tmpl_kw -+ if context and self.impl.pass_context: -+ tmpl_kw = tmpl_kw.copy() -+ tmpl_kw.setdefault('context', context) -+ return tmpl_kw -+ -+ -+class CacheImpl(object): -+ """Provide a cache implementation for use by :class:`.Cache`.""" -+ -+ def __init__(self, cache): -+ self.cache = cache -+ -+ pass_context = False -+ """If ``True``, the :class:`.Context` will be passed to -+ :meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``. -+ """ -+ -+ def get_or_create(self, key, creation_function, **kw): -+ """Retrieve a value from the cache, using the given creation function -+ to generate a new value. -+ -+ This function *must* return a value, either from -+ the cache, or via the given creation function. -+ If the creation function is called, the newly -+ created value should be populated into the cache -+ under the given key before being returned. -+ -+ :param key: the value's key. -+ :param creation_function: function that when called generates -+ a new value. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -+ -+ def set(self, key, value, **kw): -+ """Place a value in the cache. -+ -+ :param key: the value's key. -+ :param value: the value. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -+ -+ def get(self, key, **kw): -+ """Retrieve a value from the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -+ -+ def invalidate(self, key, **kw): -+ """Invalidate a value in the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py -new file mode 100644 -index 0000000..1a9ca56 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py -@@ -0,0 +1,62 @@ -+# mako/cmd.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+from argparse import ArgumentParser -+from os.path import isfile, dirname -+import sys -+from mako.template import Template -+from mako.lookup import TemplateLookup -+from mako import exceptions -+ -+def varsplit(var): -+ if "=" not in var: -+ return (var, "") -+ return var.split("=", 1) -+ -+def _exit(): -+ sys.stderr.write(exceptions.text_error_template().render()) -+ sys.exit(1) -+ -+def cmdline(argv=None): -+ -+ parser = ArgumentParser("usage: %prog [FILENAME]") -+ parser.add_argument("--var", default=[], action="append", -+ help="variable (can be used multiple times, use name=value)") -+ parser.add_argument("--template-dir", default=[], action="append", -+ help="Directory to use for template lookup (multiple " -+ "directories may be provided). 
If not given then if the " -+ "template is read from stdin, the value defaults to be " -+ "the current directory, otherwise it defaults to be the " -+ "parent directory of the file provided.") -+ parser.add_argument('input', nargs='?', default='-') -+ -+ options = parser.parse_args(argv) -+ if options.input == '-': -+ lookup_dirs = options.template_dir or ["."] -+ lookup = TemplateLookup(lookup_dirs) -+ try: -+ template = Template(sys.stdin.read(), lookup=lookup) -+ except: -+ _exit() -+ else: -+ filename = options.input -+ if not isfile(filename): -+ raise SystemExit("error: can't find %s" % filename) -+ lookup_dirs = options.template_dir or [dirname(filename)] -+ lookup = TemplateLookup(lookup_dirs) -+ try: -+ template = Template(filename=filename, lookup=lookup) -+ except: -+ _exit() -+ -+ kw = dict([varsplit(var) for var in options.var]) -+ try: -+ print(template.render(**kw)) -+ except: -+ _exit() -+ -+ -+if __name__ == "__main__": -+ cmdline() -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py -new file mode 100644 -index 0000000..4b0bda8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py -@@ -0,0 +1,1237 @@ -+# mako/codegen.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""provides functionality for rendering a parsetree constructing into module -+source code.""" -+ -+import time -+import re -+from mako.pygen import PythonPrinter -+from mako import util, ast, parsetree, filters, exceptions -+from mako import compat -+ -+ -+MAGIC_NUMBER = 10 -+ -+# names which are hardwired into the -+# template and are not accessed via the -+# context itself -+RESERVED_NAMES = set(['context', 'loop', 'UNDEFINED']) -+ -+def compile(node, -+ uri, -+ filename=None, -+ default_filters=None, -+ buffer_filters=None, -+ imports=None, -+ future_imports=None, -+ source_encoding=None, -+ generate_magic_comment=True, -+ disable_unicode=False, -+ strict_undefined=False, -+ enable_loop=True, -+ reserved_names=frozenset()): -+ -+ """Generate module source code given a parsetree node, -+ uri, and optional source filename""" -+ -+ # if on Py2K, push the "source_encoding" string to be -+ # a bytestring itself, as we will be embedding it into -+ # the generated source and we don't want to coerce the -+ # result into a unicode object, in "disable_unicode" mode -+ if not compat.py3k and isinstance(source_encoding, compat.text_type): -+ source_encoding = source_encoding.encode(source_encoding) -+ -+ -+ buf = util.FastEncodingBuffer() -+ -+ printer = PythonPrinter(buf) -+ _GenerateRenderMethod(printer, -+ _CompileContext(uri, -+ filename, -+ default_filters, -+ buffer_filters, -+ imports, -+ future_imports, -+ source_encoding, -+ generate_magic_comment, -+ disable_unicode, -+ strict_undefined, -+ enable_loop, -+ reserved_names), -+ node) -+ return buf.getvalue() -+ -+class _CompileContext(object): -+ def __init__(self, -+ uri, -+ filename, -+ default_filters, -+ buffer_filters, -+ imports, -+ future_imports, -+ source_encoding, -+ generate_magic_comment, -+ disable_unicode, -+ strict_undefined, -+ enable_loop, -+ reserved_names): -+ self.uri = uri -+ self.filename = filename -+ self.default_filters = default_filters -+ self.buffer_filters = buffer_filters -+ self.imports = imports -+ self.future_imports = future_imports -+ self.source_encoding = 
source_encoding -+ self.generate_magic_comment = generate_magic_comment -+ self.disable_unicode = disable_unicode -+ self.strict_undefined = strict_undefined -+ self.enable_loop = enable_loop -+ self.reserved_names = reserved_names -+ -+class _GenerateRenderMethod(object): -+ """A template visitor object which generates the -+ full module source for a template. -+ -+ """ -+ def __init__(self, printer, compiler, node): -+ self.printer = printer -+ self.compiler = compiler -+ self.node = node -+ self.identifier_stack = [None] -+ self.in_def = isinstance(node, (parsetree.DefTag, parsetree.BlockTag)) -+ -+ if self.in_def: -+ name = "render_%s" % node.funcname -+ args = node.get_argument_expressions() -+ filtered = len(node.filter_args.args) > 0 -+ buffered = eval(node.attributes.get('buffered', 'False')) -+ cached = eval(node.attributes.get('cached', 'False')) -+ defs = None -+ pagetag = None -+ if node.is_block and not node.is_anonymous: -+ args += ['**pageargs'] -+ else: -+ defs = self.write_toplevel() -+ pagetag = self.compiler.pagetag -+ name = "render_body" -+ if pagetag is not None: -+ args = pagetag.body_decl.get_argument_expressions() -+ if not pagetag.body_decl.kwargs: -+ args += ['**pageargs'] -+ cached = eval(pagetag.attributes.get('cached', 'False')) -+ self.compiler.enable_loop = self.compiler.enable_loop or eval( -+ pagetag.attributes.get( -+ 'enable_loop', 'False') -+ ) -+ else: -+ args = ['**pageargs'] -+ cached = False -+ buffered = filtered = False -+ if args is None: -+ args = ['context'] -+ else: -+ args = [a for a in ['context'] + args] -+ -+ self.write_render_callable( -+ pagetag or node, -+ name, args, -+ buffered, filtered, cached) -+ -+ if defs is not None: -+ for node in defs: -+ _GenerateRenderMethod(printer, compiler, node) -+ -+ if not self.in_def: -+ self.write_metadata_struct() -+ -+ def write_metadata_struct(self): -+ self.printer.source_map[self.printer.lineno] = \ -+ max(self.printer.source_map) -+ struct = { -+ "filename": self.compiler.filename, -+ "uri": self.compiler.uri, -+ "source_encoding": self.compiler.source_encoding, -+ "line_map": self.printer.source_map, -+ } -+ self.printer.writelines( -+ '"""', -+ '__M_BEGIN_METADATA', -+ compat.json.dumps(struct), -+ '__M_END_METADATA\n' -+ '"""' -+ ) -+ -+ @property -+ def identifiers(self): -+ return self.identifier_stack[-1] -+ -+ def write_toplevel(self): -+ """Traverse a template structure for module-level directives and -+ generate the start of module-level code. 
-+ -+ """ -+ inherit = [] -+ namespaces = {} -+ module_code = [] -+ -+ self.compiler.pagetag = None -+ -+ class FindTopLevel(object): -+ def visitInheritTag(s, node): -+ inherit.append(node) -+ def visitNamespaceTag(s, node): -+ namespaces[node.name] = node -+ def visitPageTag(s, node): -+ self.compiler.pagetag = node -+ def visitCode(s, node): -+ if node.ismodule: -+ module_code.append(node) -+ -+ f = FindTopLevel() -+ for n in self.node.nodes: -+ n.accept_visitor(f) -+ -+ self.compiler.namespaces = namespaces -+ -+ module_ident = set() -+ for n in module_code: -+ module_ident = module_ident.union(n.declared_identifiers()) -+ -+ module_identifiers = _Identifiers(self.compiler) -+ module_identifiers.declared = module_ident -+ -+ # module-level names, python code -+ if self.compiler.generate_magic_comment and \ -+ self.compiler.source_encoding: -+ self.printer.writeline("# -*- coding:%s -*-" % -+ self.compiler.source_encoding) -+ -+ if self.compiler.future_imports: -+ self.printer.writeline("from __future__ import %s" % -+ (", ".join(self.compiler.future_imports),)) -+ self.printer.writeline("from mako import runtime, filters, cache") -+ self.printer.writeline("UNDEFINED = runtime.UNDEFINED") -+ self.printer.writeline("__M_dict_builtin = dict") -+ self.printer.writeline("__M_locals_builtin = locals") -+ self.printer.writeline("_magic_number = %r" % MAGIC_NUMBER) -+ self.printer.writeline("_modified_time = %r" % time.time()) -+ self.printer.writeline("_enable_loop = %r" % self.compiler.enable_loop) -+ self.printer.writeline( -+ "_template_filename = %r" % self.compiler.filename) -+ self.printer.writeline("_template_uri = %r" % self.compiler.uri) -+ self.printer.writeline( -+ "_source_encoding = %r" % self.compiler.source_encoding) -+ if self.compiler.imports: -+ buf = '' -+ for imp in self.compiler.imports: -+ buf += imp + "\n" -+ self.printer.writeline(imp) -+ impcode = ast.PythonCode( -+ buf, -+ source='', lineno=0, -+ pos=0, -+ filename='template defined imports') -+ else: -+ impcode = None -+ -+ main_identifiers = module_identifiers.branch(self.node) -+ module_identifiers.topleveldefs = \ -+ module_identifiers.topleveldefs.\ -+ union(main_identifiers.topleveldefs) -+ module_identifiers.declared.add("UNDEFINED") -+ if impcode: -+ module_identifiers.declared.update(impcode.declared_identifiers) -+ -+ self.compiler.identifiers = module_identifiers -+ self.printer.writeline("_exports = %r" % -+ [n.name for n in -+ main_identifiers.topleveldefs.values()] -+ ) -+ self.printer.write_blanks(2) -+ -+ if len(module_code): -+ self.write_module_code(module_code) -+ -+ if len(inherit): -+ self.write_namespaces(namespaces) -+ self.write_inherit(inherit[-1]) -+ elif len(namespaces): -+ self.write_namespaces(namespaces) -+ -+ return list(main_identifiers.topleveldefs.values()) -+ -+ def write_render_callable(self, node, name, args, buffered, filtered, -+ cached): -+ """write a top-level render callable. 
-+ -+ this could be the main render() method or that of a top-level def.""" -+ -+ if self.in_def: -+ decorator = node.decorator -+ if decorator: -+ self.printer.writeline( -+ "@runtime._decorate_toplevel(%s)" % decorator) -+ -+ self.printer.start_source(node.lineno) -+ self.printer.writelines( -+ "def %s(%s):" % (name, ','.join(args)), -+ # push new frame, assign current frame to __M_caller -+ "__M_caller = context.caller_stack._push_frame()", -+ "try:" -+ ) -+ if buffered or filtered or cached: -+ self.printer.writeline("context._push_buffer()") -+ -+ self.identifier_stack.append( -+ self.compiler.identifiers.branch(self.node)) -+ if (not self.in_def or self.node.is_block) and '**pageargs' in args: -+ self.identifier_stack[-1].argument_declared.add('pageargs') -+ -+ if not self.in_def and ( -+ len(self.identifiers.locally_assigned) > 0 or -+ len(self.identifiers.argument_declared) > 0 -+ ): -+ self.printer.writeline("__M_locals = __M_dict_builtin(%s)" % -+ ','.join([ -+ "%s=%s" % (x, x) for x in -+ self.identifiers.argument_declared -+ ])) -+ -+ self.write_variable_declares(self.identifiers, toplevel=True) -+ -+ for n in self.node.nodes: -+ n.accept_visitor(self) -+ -+ self.write_def_finish(self.node, buffered, filtered, cached) -+ self.printer.writeline(None) -+ self.printer.write_blanks(2) -+ if cached: -+ self.write_cache_decorator( -+ node, name, -+ args, buffered, -+ self.identifiers, toplevel=True) -+ -+ def write_module_code(self, module_code): -+ """write module-level template code, i.e. that which -+ is enclosed in <%! %> tags in the template.""" -+ for n in module_code: -+ self.printer.start_source(n.lineno) -+ self.printer.write_indented_block(n.text) -+ -+ def write_inherit(self, node): -+ """write the module-level inheritance-determination callable.""" -+ -+ self.printer.writelines( -+ "def _mako_inherit(template, context):", -+ "_mako_generate_namespaces(context)", -+ "return runtime._inherit_from(context, %s, _template_uri)" % -+ (node.parsed_attributes['file']), -+ None -+ ) -+ -+ def write_namespaces(self, namespaces): -+ """write the module-level namespace-generating callable.""" -+ self.printer.writelines( -+ "def _mako_get_namespace(context, name):", -+ "try:", -+ "return context.namespaces[(__name__, name)]", -+ "except KeyError:", -+ "_mako_generate_namespaces(context)", -+ "return context.namespaces[(__name__, name)]", -+ None, None -+ ) -+ self.printer.writeline("def _mako_generate_namespaces(context):") -+ -+ -+ for node in namespaces.values(): -+ if 'import' in node.attributes: -+ self.compiler.has_ns_imports = True -+ self.printer.start_source(node.lineno) -+ if len(node.nodes): -+ self.printer.writeline("def make_namespace():") -+ export = [] -+ identifiers = self.compiler.identifiers.branch(node) -+ self.in_def = True -+ class NSDefVisitor(object): -+ def visitDefTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitBlockTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitDefOrBase(s, node): -+ if node.is_anonymous: -+ raise exceptions.CompileException( -+ "Can't put anonymous blocks inside " -+ "<%namespace>", -+ **node.exception_kwargs -+ ) -+ self.write_inline_def(node, identifiers, nested=False) -+ export.append(node.funcname) -+ vis = NSDefVisitor() -+ for n in node.nodes: -+ n.accept_visitor(vis) -+ self.printer.writeline("return [%s]" % (','.join(export))) -+ self.printer.writeline(None) -+ self.in_def = False -+ callable_name = "make_namespace()" -+ else: -+ callable_name = "None" -+ -+ if 'file' in node.parsed_attributes: -+ 
self.printer.writeline( -+ "ns = runtime.TemplateNamespace(%r," -+ " context._clean_inheritance_tokens()," -+ " templateuri=%s, callables=%s, " -+ " calling_uri=_template_uri)" % -+ ( -+ node.name, -+ node.parsed_attributes.get('file', 'None'), -+ callable_name, -+ ) -+ ) -+ elif 'module' in node.parsed_attributes: -+ self.printer.writeline( -+ "ns = runtime.ModuleNamespace(%r," -+ " context._clean_inheritance_tokens()," -+ " callables=%s, calling_uri=_template_uri," -+ " module=%s)" % -+ ( -+ node.name, -+ callable_name, -+ node.parsed_attributes.get( -+ 'module', 'None') -+ ) -+ ) -+ else: -+ self.printer.writeline( -+ "ns = runtime.Namespace(%r," -+ " context._clean_inheritance_tokens()," -+ " callables=%s, calling_uri=_template_uri)" % -+ ( -+ node.name, -+ callable_name, -+ ) -+ ) -+ if eval(node.attributes.get('inheritable', "False")): -+ self.printer.writeline("context['self'].%s = ns" % (node.name)) -+ -+ self.printer.writeline( -+ "context.namespaces[(__name__, %s)] = ns" % repr(node.name)) -+ self.printer.write_blanks(1) -+ if not len(namespaces): -+ self.printer.writeline("pass") -+ self.printer.writeline(None) -+ -+ def write_variable_declares(self, identifiers, toplevel=False, limit=None): -+ """write variable declarations at the top of a function. -+ -+ the variable declarations are in the form of callable -+ definitions for defs and/or name lookup within the -+ function's context argument. the names declared are based -+ on the names that are referenced in the function body, -+ which don't otherwise have any explicit assignment -+ operation. names that are assigned within the body are -+ assumed to be locally-scoped variables and are not -+ separately declared. -+ -+ for def callable definitions, if the def is a top-level -+ callable then a 'stub' callable is generated which wraps -+ the current Context into a closure. if the def is not -+ top-level, it is fully rendered as a local closure. -+ -+ """ -+ -+ # collection of all defs available to us in this scope -+ comp_idents = dict([(c.funcname, c) for c in identifiers.defs]) -+ to_write = set() -+ -+ # write "context.get()" for all variables we are going to -+ # need that arent in the namespace yet -+ to_write = to_write.union(identifiers.undeclared) -+ -+ # write closure functions for closures that we define -+ # right here -+ to_write = to_write.union( -+ [c.funcname for c in identifiers.closuredefs.values()]) -+ -+ # remove identifiers that are declared in the argument -+ # signature of the callable -+ to_write = to_write.difference(identifiers.argument_declared) -+ -+ # remove identifiers that we are going to assign to. -+ # in this way we mimic Python's behavior, -+ # i.e. assignment to a variable within a block -+ # means that variable is now a "locally declared" var, -+ # which cannot be referenced beforehand. -+ to_write = to_write.difference(identifiers.locally_declared) -+ -+ if self.compiler.enable_loop: -+ has_loop = "loop" in to_write -+ to_write.discard("loop") -+ else: -+ has_loop = False -+ -+ # if a limiting set was sent, constraint to those items in that list -+ # (this is used for the caching decorator) -+ if limit is not None: -+ to_write = to_write.intersection(limit) -+ -+ if toplevel and getattr(self.compiler, 'has_ns_imports', False): -+ self.printer.writeline("_import_ns = {}") -+ self.compiler.has_imports = True -+ for ident, ns in self.compiler.namespaces.items(): -+ if 'import' in ns.attributes: -+ self.printer.writeline( -+ "_mako_get_namespace(context, %r)." 
-+ "_populate(_import_ns, %r)" % -+ ( -+ ident, -+ re.split(r'\s*,\s*', ns.attributes['import']) -+ )) -+ -+ if has_loop: -+ self.printer.writeline( -+ 'loop = __M_loop = runtime.LoopStack()' -+ ) -+ -+ for ident in to_write: -+ if ident in comp_idents: -+ comp = comp_idents[ident] -+ if comp.is_block: -+ if not comp.is_anonymous: -+ self.write_def_decl(comp, identifiers) -+ else: -+ self.write_inline_def(comp, identifiers, nested=True) -+ else: -+ if comp.is_root(): -+ self.write_def_decl(comp, identifiers) -+ else: -+ self.write_inline_def(comp, identifiers, nested=True) -+ -+ elif ident in self.compiler.namespaces: -+ self.printer.writeline( -+ "%s = _mako_get_namespace(context, %r)" % -+ (ident, ident) -+ ) -+ else: -+ if getattr(self.compiler, 'has_ns_imports', False): -+ if self.compiler.strict_undefined: -+ self.printer.writelines( -+ "%s = _import_ns.get(%r, UNDEFINED)" % -+ (ident, ident), -+ "if %s is UNDEFINED:" % ident, -+ "try:", -+ "%s = context[%r]" % (ident, ident), -+ "except KeyError:", -+ "raise NameError(\"'%s' is not defined\")" % -+ ident, -+ None, None -+ ) -+ else: -+ self.printer.writeline( -+ "%s = _import_ns.get(%r, context.get(%r, UNDEFINED))" % -+ (ident, ident, ident)) -+ else: -+ if self.compiler.strict_undefined: -+ self.printer.writelines( -+ "try:", -+ "%s = context[%r]" % (ident, ident), -+ "except KeyError:", -+ "raise NameError(\"'%s' is not defined\")" % -+ ident, -+ None -+ ) -+ else: -+ self.printer.writeline( -+ "%s = context.get(%r, UNDEFINED)" % (ident, ident) -+ ) -+ -+ self.printer.writeline("__M_writer = context.writer()") -+ -+ def write_def_decl(self, node, identifiers): -+ """write a locally-available callable referencing a top-level def""" -+ funcname = node.funcname -+ namedecls = node.get_argument_expressions() -+ nameargs = node.get_argument_expressions(as_call=True) -+ -+ if not self.in_def and ( -+ len(self.identifiers.locally_assigned) > 0 or -+ len(self.identifiers.argument_declared) > 0): -+ nameargs.insert(0, 'context._locals(__M_locals)') -+ else: -+ nameargs.insert(0, 'context') -+ self.printer.writeline("def %s(%s):" % (funcname, ",".join(namedecls))) -+ self.printer.writeline( -+ "return render_%s(%s)" % (funcname, ",".join(nameargs))) -+ self.printer.writeline(None) -+ -+ def write_inline_def(self, node, identifiers, nested): -+ """write a locally-available def callable inside an enclosing def.""" -+ -+ namedecls = node.get_argument_expressions() -+ -+ decorator = node.decorator -+ if decorator: -+ self.printer.writeline( -+ "@runtime._decorate_inline(context, %s)" % decorator) -+ self.printer.writeline( -+ "def %s(%s):" % (node.funcname, ",".join(namedecls))) -+ filtered = len(node.filter_args.args) > 0 -+ buffered = eval(node.attributes.get('buffered', 'False')) -+ cached = eval(node.attributes.get('cached', 'False')) -+ self.printer.writelines( -+ # push new frame, assign current frame to __M_caller -+ "__M_caller = context.caller_stack._push_frame()", -+ "try:" -+ ) -+ if buffered or filtered or cached: -+ self.printer.writelines( -+ "context._push_buffer()", -+ ) -+ -+ identifiers = identifiers.branch(node, nested=nested) -+ -+ self.write_variable_declares(identifiers) -+ -+ self.identifier_stack.append(identifiers) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ self.identifier_stack.pop() -+ -+ self.write_def_finish(node, buffered, filtered, cached) -+ self.printer.writeline(None) -+ if cached: -+ self.write_cache_decorator(node, node.funcname, -+ namedecls, False, identifiers, -+ inline=True, toplevel=False) -+ -+ 
def write_def_finish(self, node, buffered, filtered, cached, -+ callstack=True): -+ """write the end section of a rendering function, either outermost or -+ inline. -+ -+ this takes into account if the rendering function was filtered, -+ buffered, etc. and closes the corresponding try: block if any, and -+ writes code to retrieve captured content, apply filters, send proper -+ return value.""" -+ -+ if not buffered and not cached and not filtered: -+ self.printer.writeline("return ''") -+ if callstack: -+ self.printer.writelines( -+ "finally:", -+ "context.caller_stack._pop_frame()", -+ None -+ ) -+ -+ if buffered or filtered or cached: -+ if buffered or cached: -+ # in a caching scenario, don't try to get a writer -+ # from the context after popping; assume the caching -+ # implemenation might be using a context with no -+ # extra buffers -+ self.printer.writelines( -+ "finally:", -+ "__M_buf = context._pop_buffer()" -+ ) -+ else: -+ self.printer.writelines( -+ "finally:", -+ "__M_buf, __M_writer = context._pop_buffer_and_writer()" -+ ) -+ -+ if callstack: -+ self.printer.writeline("context.caller_stack._pop_frame()") -+ -+ s = "__M_buf.getvalue()" -+ if filtered: -+ s = self.create_filter_callable(node.filter_args.args, s, -+ False) -+ self.printer.writeline(None) -+ if buffered and not cached: -+ s = self.create_filter_callable(self.compiler.buffer_filters, -+ s, False) -+ if buffered or cached: -+ self.printer.writeline("return %s" % s) -+ else: -+ self.printer.writelines( -+ "__M_writer(%s)" % s, -+ "return ''" -+ ) -+ -+ def write_cache_decorator(self, node_or_pagetag, name, -+ args, buffered, identifiers, -+ inline=False, toplevel=False): -+ """write a post-function decorator to replace a rendering -+ callable with a cached version of itself.""" -+ -+ self.printer.writeline("__M_%s = %s" % (name, name)) -+ cachekey = node_or_pagetag.parsed_attributes.get('cache_key', -+ repr(name)) -+ -+ cache_args = {} -+ if self.compiler.pagetag is not None: -+ cache_args.update( -+ ( -+ pa[6:], -+ self.compiler.pagetag.parsed_attributes[pa] -+ ) -+ for pa in self.compiler.pagetag.parsed_attributes -+ if pa.startswith('cache_') and pa != 'cache_key' -+ ) -+ cache_args.update( -+ ( -+ pa[6:], -+ node_or_pagetag.parsed_attributes[pa] -+ ) for pa in node_or_pagetag.parsed_attributes -+ if pa.startswith('cache_') and pa != 'cache_key' -+ ) -+ if 'timeout' in cache_args: -+ cache_args['timeout'] = int(eval(cache_args['timeout'])) -+ -+ self.printer.writeline("def %s(%s):" % (name, ','.join(args))) -+ -+ # form "arg1, arg2, arg3=arg3, arg4=arg4", etc. -+ pass_args = [ -+ "%s=%s" % ((a.split('=')[0],) * 2) if '=' in a else a -+ for a in args -+ ] -+ -+ self.write_variable_declares( -+ identifiers, -+ toplevel=toplevel, -+ limit=node_or_pagetag.undeclared_identifiers() -+ ) -+ if buffered: -+ s = "context.get('local')."\ -+ "cache._ctx_get_or_create("\ -+ "%s, lambda:__M_%s(%s), context, %s__M_defname=%r)" % ( -+ cachekey, name, ','.join(pass_args), -+ ''.join(["%s=%s, " % (k, v) -+ for k, v in cache_args.items()]), -+ name -+ ) -+ # apply buffer_filters -+ s = self.create_filter_callable(self.compiler.buffer_filters, s, -+ False) -+ self.printer.writelines("return " + s, None) -+ else: -+ self.printer.writelines( -+ "__M_writer(context.get('local')." 
-+ "cache._ctx_get_or_create(" -+ "%s, lambda:__M_%s(%s), context, %s__M_defname=%r))" % -+ ( -+ cachekey, name, ','.join(pass_args), -+ ''.join(["%s=%s, " % (k, v) -+ for k, v in cache_args.items()]), -+ name, -+ ), -+ "return ''", -+ None -+ ) -+ -+ def create_filter_callable(self, args, target, is_expression): -+ """write a filter-applying expression based on the filters -+ present in the given filter names, adjusting for the global -+ 'default' filter aliases as needed.""" -+ -+ def locate_encode(name): -+ if re.match(r'decode\..+', name): -+ return "filters." + name -+ elif self.compiler.disable_unicode: -+ return filters.NON_UNICODE_ESCAPES.get(name, name) -+ else: -+ return filters.DEFAULT_ESCAPES.get(name, name) -+ -+ if 'n' not in args: -+ if is_expression: -+ if self.compiler.pagetag: -+ args = self.compiler.pagetag.filter_args.args + args -+ if self.compiler.default_filters: -+ args = self.compiler.default_filters + args -+ for e in args: -+ # if filter given as a function, get just the identifier portion -+ if e == 'n': -+ continue -+ m = re.match(r'(.+?)(\(.*\))', e) -+ if m: -+ ident, fargs = m.group(1, 2) -+ f = locate_encode(ident) -+ e = f + fargs -+ else: -+ e = locate_encode(e) -+ assert e is not None -+ target = "%s(%s)" % (e, target) -+ return target -+ -+ def visitExpression(self, node): -+ self.printer.start_source(node.lineno) -+ if len(node.escapes) or \ -+ ( -+ self.compiler.pagetag is not None and -+ len(self.compiler.pagetag.filter_args.args) -+ ) or \ -+ len(self.compiler.default_filters): -+ -+ s = self.create_filter_callable(node.escapes_code.args, -+ "%s" % node.text, True) -+ self.printer.writeline("__M_writer(%s)" % s) -+ else: -+ self.printer.writeline("__M_writer(%s)" % node.text) -+ -+ def visitControlLine(self, node): -+ if node.isend: -+ self.printer.writeline(None) -+ if node.has_loop_context: -+ self.printer.writeline('finally:') -+ self.printer.writeline("loop = __M_loop._exit()") -+ self.printer.writeline(None) -+ else: -+ self.printer.start_source(node.lineno) -+ if self.compiler.enable_loop and node.keyword == 'for': -+ text = mangle_mako_loop(node, self.printer) -+ else: -+ text = node.text -+ self.printer.writeline(text) -+ children = node.get_children() -+ # this covers the three situations where we want to insert a pass: -+ # 1) a ternary control line with no children, -+ # 2) a primary control line with nothing but its own ternary -+ # and end control lines, and -+ # 3) any control line with no content other than comments -+ if not children or ( -+ compat.all(isinstance(c, (parsetree.Comment, -+ parsetree.ControlLine)) -+ for c in children) and -+ compat.all((node.is_ternary(c.keyword) or c.isend) -+ for c in children -+ if isinstance(c, parsetree.ControlLine))): -+ self.printer.writeline("pass") -+ -+ def visitText(self, node): -+ self.printer.start_source(node.lineno) -+ self.printer.writeline("__M_writer(%s)" % repr(node.content)) -+ -+ def visitTextTag(self, node): -+ filtered = len(node.filter_args.args) > 0 -+ if filtered: -+ self.printer.writelines( -+ "__M_writer = context._push_writer()", -+ "try:", -+ ) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ if filtered: -+ self.printer.writelines( -+ "finally:", -+ "__M_buf, __M_writer = context._pop_buffer_and_writer()", -+ "__M_writer(%s)" % -+ self.create_filter_callable( -+ node.filter_args.args, -+ "__M_buf.getvalue()", -+ False), -+ None -+ ) -+ -+ def visitCode(self, node): -+ if not node.ismodule: -+ self.printer.start_source(node.lineno) -+ 
self.printer.write_indented_block(node.text) -+ -+ if not self.in_def and len(self.identifiers.locally_assigned) > 0: -+ # if we are the "template" def, fudge locally -+ # declared/modified variables into the "__M_locals" dictionary, -+ # which is used for def calls within the same template, -+ # to simulate "enclosing scope" -+ self.printer.writeline( -+ '__M_locals_builtin_stored = __M_locals_builtin()') -+ self.printer.writeline( -+ '__M_locals.update(__M_dict_builtin([(__M_key,' -+ ' __M_locals_builtin_stored[__M_key]) for __M_key in' -+ ' [%s] if __M_key in __M_locals_builtin_stored]))' % -+ ','.join([repr(x) for x in node.declared_identifiers()])) -+ -+ def visitIncludeTag(self, node): -+ self.printer.start_source(node.lineno) -+ args = node.attributes.get('args') -+ if args: -+ self.printer.writeline( -+ "runtime._include_file(context, %s, _template_uri, %s)" % -+ (node.parsed_attributes['file'], args)) -+ else: -+ self.printer.writeline( -+ "runtime._include_file(context, %s, _template_uri)" % -+ (node.parsed_attributes['file'])) -+ -+ def visitNamespaceTag(self, node): -+ pass -+ -+ def visitDefTag(self, node): -+ pass -+ -+ def visitBlockTag(self, node): -+ if node.is_anonymous: -+ self.printer.writeline("%s()" % node.funcname) -+ else: -+ nameargs = node.get_argument_expressions(as_call=True) -+ nameargs += ['**pageargs'] -+ self.printer.writeline("if 'parent' not in context._data or " -+ "not hasattr(context._data['parent'], '%s'):" -+ % node.funcname) -+ self.printer.writeline( -+ "context['self'].%s(%s)" % (node.funcname, ",".join(nameargs))) -+ self.printer.writeline("\n") -+ -+ def visitCallNamespaceTag(self, node): -+ # TODO: we can put namespace-specific checks here, such -+ # as ensure the given namespace will be imported, -+ # pre-import the namespace, etc. 
-+ self.visitCallTag(node) -+ -+ def visitCallTag(self, node): -+ self.printer.writeline("def ccall(caller):") -+ export = ['body'] -+ callable_identifiers = self.identifiers.branch(node, nested=True) -+ body_identifiers = callable_identifiers.branch(node, nested=False) -+ # we want the 'caller' passed to ccall to be used -+ # for the body() function, but for other non-body() -+ # <%def>s within <%call> we want the current caller -+ # off the call stack (if any) -+ body_identifiers.add_declared('caller') -+ -+ self.identifier_stack.append(body_identifiers) -+ class DefVisitor(object): -+ def visitDefTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitBlockTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitDefOrBase(s, node): -+ self.write_inline_def(node, callable_identifiers, nested=False) -+ if not node.is_anonymous: -+ export.append(node.funcname) -+ # remove defs that are within the <%call> from the -+ # "closuredefs" defined in the body, so they dont render twice -+ if node.funcname in body_identifiers.closuredefs: -+ del body_identifiers.closuredefs[node.funcname] -+ -+ vis = DefVisitor() -+ for n in node.nodes: -+ n.accept_visitor(vis) -+ self.identifier_stack.pop() -+ -+ bodyargs = node.body_decl.get_argument_expressions() -+ self.printer.writeline("def body(%s):" % ','.join(bodyargs)) -+ -+ # TODO: figure out best way to specify -+ # buffering/nonbuffering (at call time would be better) -+ buffered = False -+ if buffered: -+ self.printer.writelines( -+ "context._push_buffer()", -+ "try:" -+ ) -+ self.write_variable_declares(body_identifiers) -+ self.identifier_stack.append(body_identifiers) -+ -+ for n in node.nodes: -+ n.accept_visitor(self) -+ self.identifier_stack.pop() -+ -+ self.write_def_finish(node, buffered, False, False, callstack=False) -+ self.printer.writelines( -+ None, -+ "return [%s]" % (','.join(export)), -+ None -+ ) -+ -+ self.printer.writelines( -+ # push on caller for nested call -+ "context.caller_stack.nextcaller = " -+ "runtime.Namespace('caller', context, " -+ "callables=ccall(__M_caller))", -+ "try:") -+ self.printer.start_source(node.lineno) -+ self.printer.writelines( -+ "__M_writer(%s)" % self.create_filter_callable( -+ [], node.expression, True), -+ "finally:", -+ "context.caller_stack.nextcaller = None", -+ None -+ ) -+ -+class _Identifiers(object): -+ """tracks the status of identifier names as template code is rendered.""" -+ -+ def __init__(self, compiler, node=None, parent=None, nested=False): -+ if parent is not None: -+ # if we are the branch created in write_namespaces(), -+ # we don't share any context from the main body(). -+ if isinstance(node, parsetree.NamespaceTag): -+ self.declared = set() -+ self.topleveldefs = util.SetLikeDict() -+ else: -+ # things that have already been declared -+ # in an enclosing namespace (i.e. names we can just use) -+ self.declared = set(parent.declared).\ -+ union([c.name for c in parent.closuredefs.values()]).\ -+ union(parent.locally_declared).\ -+ union(parent.argument_declared) -+ -+ # if these identifiers correspond to a "nested" -+ # scope, it means whatever the parent identifiers -+ # had as undeclared will have been declared by that parent, -+ # and therefore we have them in our scope. 
-+ if nested: -+ self.declared = self.declared.union(parent.undeclared) -+ -+ # top level defs that are available -+ self.topleveldefs = util.SetLikeDict(**parent.topleveldefs) -+ else: -+ self.declared = set() -+ self.topleveldefs = util.SetLikeDict() -+ -+ self.compiler = compiler -+ -+ # things within this level that are referenced before they -+ # are declared (e.g. assigned to) -+ self.undeclared = set() -+ -+ # things that are declared locally. some of these things -+ # could be in the "undeclared" list as well if they are -+ # referenced before declared -+ self.locally_declared = set() -+ -+ # assignments made in explicit python blocks. -+ # these will be propagated to -+ # the context of local def calls. -+ self.locally_assigned = set() -+ -+ # things that are declared in the argument -+ # signature of the def callable -+ self.argument_declared = set() -+ -+ # closure defs that are defined in this level -+ self.closuredefs = util.SetLikeDict() -+ -+ self.node = node -+ -+ if node is not None: -+ node.accept_visitor(self) -+ -+ illegal_names = self.compiler.reserved_names.intersection( -+ self.locally_declared) -+ if illegal_names: -+ raise exceptions.NameConflictError( -+ "Reserved words declared in template: %s" % -+ ", ".join(illegal_names)) -+ -+ -+ def branch(self, node, **kwargs): -+ """create a new Identifiers for a new Node, with -+ this Identifiers as the parent.""" -+ -+ return _Identifiers(self.compiler, node, self, **kwargs) -+ -+ @property -+ def defs(self): -+ return set(self.topleveldefs.union(self.closuredefs).values()) -+ -+ def __repr__(self): -+ return "Identifiers(declared=%r, locally_declared=%r, "\ -+ "undeclared=%r, topleveldefs=%r, closuredefs=%r, "\ -+ "argumentdeclared=%r)" %\ -+ ( -+ list(self.declared), -+ list(self.locally_declared), -+ list(self.undeclared), -+ [c.name for c in self.topleveldefs.values()], -+ [c.name for c in self.closuredefs.values()], -+ self.argument_declared) -+ -+ def check_declared(self, node): -+ """update the state of this Identifiers with the undeclared -+ and declared identifiers of the given node.""" -+ -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and\ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ for ident in node.declared_identifiers(): -+ self.locally_declared.add(ident) -+ -+ def add_declared(self, ident): -+ self.declared.add(ident) -+ if ident in self.undeclared: -+ self.undeclared.remove(ident) -+ -+ def visitExpression(self, node): -+ self.check_declared(node) -+ -+ def visitControlLine(self, node): -+ self.check_declared(node) -+ -+ def visitCode(self, node): -+ if not node.ismodule: -+ self.check_declared(node) -+ self.locally_assigned = self.locally_assigned.union( -+ node.declared_identifiers()) -+ -+ def visitNamespaceTag(self, node): -+ # only traverse into the sub-elements of a -+ # <%namespace> tag if we are the branch created in -+ # write_namespaces() -+ if self.node is node: -+ for n in node.nodes: -+ n.accept_visitor(self) -+ -+ def _check_name_exists(self, collection, node): -+ existing = collection.get(node.funcname) -+ collection[node.funcname] = node -+ if existing is not None and \ -+ existing is not node and \ -+ (node.is_block or existing.is_block): -+ raise exceptions.CompileException( -+ "%%def or %%block named '%s' already " -+ "exists in this template." 
% -+ node.funcname, **node.exception_kwargs) -+ -+ def visitDefTag(self, node): -+ if node.is_root() and not node.is_anonymous: -+ self._check_name_exists(self.topleveldefs, node) -+ elif node is not self.node: -+ self._check_name_exists(self.closuredefs, node) -+ -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ -+ # visit defs only one level deep -+ if node is self.node: -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ -+ for n in node.nodes: -+ n.accept_visitor(self) -+ -+ def visitBlockTag(self, node): -+ if node is not self.node and not node.is_anonymous: -+ -+ if isinstance(self.node, parsetree.DefTag): -+ raise exceptions.CompileException( -+ "Named block '%s' not allowed inside of def '%s'" -+ % (node.name, self.node.name), **node.exception_kwargs) -+ elif isinstance(self.node, -+ (parsetree.CallTag, parsetree.CallNamespaceTag)): -+ raise exceptions.CompileException( -+ "Named block '%s' not allowed inside of <%%call> tag" -+ % (node.name, ), **node.exception_kwargs) -+ -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ -+ if not node.is_anonymous: -+ self._check_name_exists(self.topleveldefs, node) -+ self.undeclared.add(node.funcname) -+ elif node is not self.node: -+ self._check_name_exists(self.closuredefs, node) -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ -+ def visitTextTag(self, node): -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ -+ def visitIncludeTag(self, node): -+ self.check_declared(node) -+ -+ def visitPageTag(self, node): -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ self.check_declared(node) -+ -+ def visitCallNamespaceTag(self, node): -+ self.visitCallTag(node) -+ -+ def visitCallTag(self, node): -+ if node is self.node: -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union( -+ self.locally_declared): -+ self.undeclared.add(ident) -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ else: -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union( -+ self.locally_declared): -+ self.undeclared.add(ident) -+ -+ -+_FOR_LOOP = re.compile( -+ r'^for\s+((?:\(?)\s*[A-Za-z_][A-Za-z_0-9]*' -+ r'(?:\s*,\s*(?:[A-Za-z_][A-Za-z0-9_]*),??)*\s*(?:\)?))\s+in\s+(.*):' -+) -+ -+def mangle_mako_loop(node, printer): -+ """converts a for loop into a context manager wrapped around a for loop -+ when access to the `loop` variable has been detected in the for loop body -+ """ -+ loop_variable = LoopVariable() -+ node.accept_visitor(loop_variable) -+ if loop_variable.detected: -+ node.nodes[-1].has_loop_context = True -+ match = _FOR_LOOP.match(node.text) -+ if match: -+ printer.writelines( -+ 'loop = __M_loop._enter(%s)' % match.group(2), -+ 'try:' -+ #'with __M_loop(%s) as loop:' % match.group(2) -+ ) -+ text = 'for %s in loop:' % match.group(1) -+ else: -+ raise SyntaxError("Couldn't apply loop context: %s" % node.text) -+ else: -+ text = node.text -+ return text -+ -+ 
-+class LoopVariable(object): -+ """A node visitor which looks for the name 'loop' within undeclared -+ identifiers.""" -+ -+ def __init__(self): -+ self.detected = False -+ -+ def _loop_reference_detected(self, node): -+ if 'loop' in node.undeclared_identifiers(): -+ self.detected = True -+ else: -+ for n in node.get_children(): -+ n.accept_visitor(self) -+ -+ def visitControlLine(self, node): -+ self._loop_reference_detected(node) -+ -+ def visitCode(self, node): -+ self._loop_reference_detected(node) -+ -+ def visitExpression(self, node): -+ self._loop_reference_detected(node) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py -new file mode 100644 -index 0000000..fe277bb ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py -@@ -0,0 +1,174 @@ -+import sys -+import time -+ -+py3k = sys.version_info >= (3, 0) -+py33 = sys.version_info >= (3, 3) -+py2k = sys.version_info < (3,) -+py26 = sys.version_info >= (2, 6) -+jython = sys.platform.startswith('java') -+win32 = sys.platform.startswith('win') -+pypy = hasattr(sys, 'pypy_version_info') -+ -+if py3k: -+ from io import StringIO -+ import builtins as compat_builtins -+ from urllib.parse import quote_plus, unquote_plus -+ from html.entities import codepoint2name, name2codepoint -+ string_types = str, -+ binary_type = bytes -+ text_type = str -+ -+ from io import BytesIO as byte_buffer -+ -+ def u(s): -+ return s -+ -+ def b(s): -+ return s.encode("latin-1") -+ -+ def octal(lit): -+ return eval("0o" + lit) -+ -+else: -+ import __builtin__ as compat_builtins -+ try: -+ from cStringIO import StringIO -+ except: -+ from StringIO import StringIO -+ -+ byte_buffer = StringIO -+ -+ from urllib import quote_plus, unquote_plus -+ from htmlentitydefs import codepoint2name, name2codepoint -+ string_types = basestring, -+ binary_type = str -+ text_type = unicode -+ -+ def u(s): -+ return unicode(s, "utf-8") -+ -+ def b(s): -+ return s -+ -+ def octal(lit): -+ return eval("0" + lit) -+ -+ -+if py33: -+ from importlib import machinery -+ def load_module(module_id, path): -+ return machinery.SourceFileLoader(module_id, path).load_module() -+else: -+ import imp -+ def load_module(module_id, path): -+ fp = open(path, 'rb') -+ try: -+ return imp.load_source(module_id, path, fp) -+ finally: -+ fp.close() -+ -+ -+if py3k: -+ def reraise(tp, value, tb=None, cause=None): -+ if cause is not None: -+ value.__cause__ = cause -+ if value.__traceback__ is not tb: -+ raise value.with_traceback(tb) -+ raise value -+else: -+ exec("def reraise(tp, value, tb=None, cause=None):\n" -+ " raise tp, value, tb\n") -+ -+ -+def exception_as(): -+ return sys.exc_info()[1] -+ -+try: -+ import threading -+ if py3k: -+ import _thread as thread -+ else: -+ import thread -+except ImportError: -+ import dummy_threading as threading -+ if py3k: -+ import _dummy_thread as thread -+ else: -+ import dummy_thread as thread -+ -+if win32 or jython: -+ time_func = time.clock -+else: -+ time_func = time.time -+ -+try: -+ from functools import partial -+except: -+ def partial(func, *args, **keywords): -+ def newfunc(*fargs, **fkeywords): -+ newkeywords = keywords.copy() -+ newkeywords.update(fkeywords) -+ return func(*(args + fargs), **newkeywords) -+ return newfunc -+ -+ -+all = all -+import json -+ -+def exception_name(exc): -+ return exc.__class__.__name__ -+ -+try: -+ from inspect import CO_VARKEYWORDS, CO_VARARGS -+ def inspect_func_args(fn): -+ if py3k: -+ co = fn.__code__ -+ else: -+ 
co = fn.func_code -+ -+ nargs = co.co_argcount -+ names = co.co_varnames -+ args = list(names[:nargs]) -+ -+ varargs = None -+ if co.co_flags & CO_VARARGS: -+ varargs = co.co_varnames[nargs] -+ nargs = nargs + 1 -+ varkw = None -+ if co.co_flags & CO_VARKEYWORDS: -+ varkw = co.co_varnames[nargs] -+ -+ if py3k: -+ return args, varargs, varkw, fn.__defaults__ -+ else: -+ return args, varargs, varkw, fn.func_defaults -+except ImportError: -+ import inspect -+ def inspect_func_args(fn): -+ return inspect.getargspec(fn) -+ -+if py3k: -+ def callable(fn): -+ return hasattr(fn, '__call__') -+else: -+ callable = callable -+ -+ -+################################################ -+# cross-compatible metaclass implementation -+# Copyright (c) 2010-2012 Benjamin Peterson -+def with_metaclass(meta, base=object): -+ """Create a base class with a metaclass.""" -+ return meta("%sBase" % meta.__name__, (base,), {}) -+################################################ -+ -+ -+def arg_stringname(func_arg): -+ """Gets the string name of a kwarg or vararg -+ In Python3.4 a function's args are -+ of _ast.arg type not _ast.name -+ """ -+ if hasattr(func_arg, 'arg'): -+ return func_arg.arg -+ else: -+ return str(func_arg) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py -new file mode 100644 -index 0000000..c531f21 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py -@@ -0,0 +1,373 @@ -+# mako/exceptions.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""exception classes""" -+ -+import traceback -+import sys -+from mako import util, compat -+ -+class MakoException(Exception): -+ pass -+ -+class RuntimeException(MakoException): -+ pass -+ -+def _format_filepos(lineno, pos, filename): -+ if filename is None: -+ return " at line: %d char: %d" % (lineno, pos) -+ else: -+ return " in file '%s' at line: %d char: %d" % (filename, lineno, pos) -+ -+ -+class CompileException(MakoException): -+ def __init__(self, message, source, lineno, pos, filename): -+ MakoException.__init__(self, -+ message + _format_filepos(lineno, pos, filename)) -+ self.lineno = lineno -+ self.pos = pos -+ self.filename = filename -+ self.source = source -+ -+class SyntaxException(MakoException): -+ def __init__(self, message, source, lineno, pos, filename): -+ MakoException.__init__(self, -+ message + _format_filepos(lineno, pos, filename)) -+ self.lineno = lineno -+ self.pos = pos -+ self.filename = filename -+ self.source = source -+ -+class UnsupportedError(MakoException): -+ """raised when a retired feature is used.""" -+ -+class NameConflictError(MakoException): -+ """raised when a reserved word is used inappropriately""" -+ -+class TemplateLookupException(MakoException): -+ pass -+ -+class TopLevelLookupException(TemplateLookupException): -+ pass -+ -+class RichTraceback(object): -+ """Pull the current exception from the ``sys`` traceback and extracts -+ Mako-specific template information. -+ -+ See the usage examples in :ref:`handling_exceptions`. 
-+ -+ """ -+ def __init__(self, error=None, traceback=None): -+ self.source, self.lineno = "", 0 -+ -+ if error is None or traceback is None: -+ t, value, tback = sys.exc_info() -+ -+ if error is None: -+ error = value or t -+ -+ if traceback is None: -+ traceback = tback -+ -+ self.error = error -+ self.records = self._init(traceback) -+ -+ if isinstance(self.error, (CompileException, SyntaxException)): -+ self.source = self.error.source -+ self.lineno = self.error.lineno -+ self._has_source = True -+ -+ self._init_message() -+ -+ @property -+ def errorname(self): -+ return compat.exception_name(self.error) -+ -+ def _init_message(self): -+ """Find a unicode representation of self.error""" -+ try: -+ self.message = compat.text_type(self.error) -+ except UnicodeError: -+ try: -+ self.message = str(self.error) -+ except UnicodeEncodeError: -+ # Fallback to args as neither unicode nor -+ # str(Exception(u'\xe6')) work in Python < 2.6 -+ self.message = self.error.args[0] -+ if not isinstance(self.message, compat.text_type): -+ self.message = compat.text_type(self.message, 'ascii', 'replace') -+ -+ def _get_reformatted_records(self, records): -+ for rec in records: -+ if rec[6] is not None: -+ yield (rec[4], rec[5], rec[2], rec[6]) -+ else: -+ yield tuple(rec[0:4]) -+ -+ @property -+ def traceback(self): -+ """Return a list of 4-tuple traceback records (i.e. normal python -+ format) with template-corresponding lines remapped to the originating -+ template. -+ -+ """ -+ return list(self._get_reformatted_records(self.records)) -+ -+ @property -+ def reverse_records(self): -+ return reversed(self.records) -+ -+ @property -+ def reverse_traceback(self): -+ """Return the same data as traceback, except in reverse order. -+ """ -+ -+ return list(self._get_reformatted_records(self.reverse_records)) -+ -+ def _init(self, trcback): -+ """format a traceback from sys.exc_info() into 7-item tuples, -+ containing the regular four traceback tuple items, plus the original -+ template filename, the line number adjusted relative to the template -+ source, and code line from that line number of the template.""" -+ -+ import mako.template -+ mods = {} -+ rawrecords = traceback.extract_tb(trcback) -+ new_trcback = [] -+ for filename, lineno, function, line in rawrecords: -+ if not line: -+ line = '' -+ try: -+ (line_map, template_lines) = mods[filename] -+ except KeyError: -+ try: -+ info = mako.template._get_module_info(filename) -+ module_source = info.code -+ template_source = info.source -+ template_filename = info.template_filename or filename -+ except KeyError: -+ # A normal .py file (not a Template) -+ if not compat.py3k: -+ try: -+ fp = open(filename, 'rb') -+ encoding = util.parse_encoding(fp) -+ fp.close() -+ except IOError: -+ encoding = None -+ if encoding: -+ line = line.decode(encoding) -+ else: -+ line = line.decode('ascii', 'replace') -+ new_trcback.append((filename, lineno, function, line, -+ None, None, None, None)) -+ continue -+ -+ template_ln = 1 -+ -+ source_map = mako.template.ModuleInfo.\ -+ get_module_source_metadata( -+ module_source, full_line_map=True) -+ line_map = source_map['full_line_map'] -+ -+ template_lines = [line for line in -+ template_source.split("\n")] -+ mods[filename] = (line_map, template_lines) -+ -+ template_ln = line_map[lineno - 1] -+ -+ if template_ln <= len(template_lines): -+ template_line = template_lines[template_ln - 1] -+ else: -+ template_line = None -+ new_trcback.append((filename, lineno, function, -+ line, template_filename, template_ln, -+ template_line, 
template_source)) -+ if not self.source: -+ for l in range(len(new_trcback) - 1, 0, -1): -+ if new_trcback[l][5]: -+ self.source = new_trcback[l][7] -+ self.lineno = new_trcback[l][5] -+ break -+ else: -+ if new_trcback: -+ try: -+ # A normal .py file (not a Template) -+ fp = open(new_trcback[-1][0], 'rb') -+ encoding = util.parse_encoding(fp) -+ fp.seek(0) -+ self.source = fp.read() -+ fp.close() -+ if encoding: -+ self.source = self.source.decode(encoding) -+ except IOError: -+ self.source = '' -+ self.lineno = new_trcback[-1][1] -+ return new_trcback -+ -+ -+def text_error_template(lookup=None): -+ """Provides a template that renders a stack trace in a similar format to -+ the Python interpreter, substituting source template filenames, line -+ numbers and code for that of the originating source template, as -+ applicable. -+ -+ """ -+ import mako.template -+ return mako.template.Template(r""" -+<%page args="error=None, traceback=None"/> -+<%! -+ from mako.exceptions import RichTraceback -+%>\ -+<% -+ tback = RichTraceback(error=error, traceback=traceback) -+%>\ -+Traceback (most recent call last): -+% for (filename, lineno, function, line) in tback.traceback: -+ File "${filename}", line ${lineno}, in ${function or '?'} -+ ${line | trim} -+% endfor -+${tback.errorname}: ${tback.message} -+""") -+ -+ -+def _install_pygments(): -+ global syntax_highlight, pygments_html_formatter -+ from mako.ext.pygmentplugin import syntax_highlight,\ -+ pygments_html_formatter -+ -+def _install_fallback(): -+ global syntax_highlight, pygments_html_formatter -+ from mako.filters import html_escape -+ pygments_html_formatter = None -+ def syntax_highlight(filename='', language=None): -+ return html_escape -+ -+def _install_highlighting(): -+ try: -+ _install_pygments() -+ except ImportError: -+ _install_fallback() -+_install_highlighting() -+ -+def html_error_template(): -+ """Provides a template that renders a stack trace in an HTML format, -+ providing an excerpt of code as well as substituting source template -+ filenames, line numbers and code for that of the originating source -+ template, as applicable. -+ -+ The template's default ``encoding_errors`` value is -+ ``'htmlentityreplace'``. The template has two options. With the -+ ``full`` option disabled, only a section of an HTML document is -+ returned. With the ``css`` option disabled, the default stylesheet -+ won't be included. -+ -+ """ -+ import mako.template -+ return mako.template.Template(r""" -+<%! -+ from mako.exceptions import RichTraceback, syntax_highlight,\ -+ pygments_html_formatter -+%> -+<%page args="full=True, css=True, error=None, traceback=None"/> -+% if full: -+ -+ -+ Mako Runtime Error -+% endif -+% if css: -+ -+% endif -+% if full: -+ -+ -+% endif -+ -+

-+<h2>Error !</h2>
-+<%
-+    tback = RichTraceback(error=error, traceback=traceback)
-+    src = tback.source
-+    line = tback.lineno
-+    if src:
-+        lines = src.split('\n')
-+    else:
-+        lines = None
-+%>
-+<p>
-+${tback.errorname}: ${tback.message|h}
-+</p>
-+
-+% if lines:
-+    <div class="sample">
-+    <div class="nonhighlight">
-+% for index in range(max(0, line-4),min(len(lines), line+5)):
-+    <%
-+       if pygments_html_formatter:
-+           pygments_html_formatter.linenostart = index + 1
-+    %>
-+    % if index + 1 == line:
-+    <%
-+       if pygments_html_formatter:
-+           old_cssclass = pygments_html_formatter.cssclass
-+           pygments_html_formatter.cssclass = 'error ' + old_cssclass
-+    %>
-+        ${lines[index] | syntax_highlight(language='mako')}
-+    <%
-+       if pygments_html_formatter:
-+           pygments_html_formatter.cssclass = old_cssclass
-+    %>
-+    % else:
-+        ${lines[index] | syntax_highlight(language='mako')}
-+    % endif
-+% endfor
-+    </div>
-+    </div>
-+% endif
-+
-+<div class="stacktrace">
-+% for (filename, lineno, function, line) in tback.reverse_traceback:
-+    <div class="location">${filename}, line ${lineno}:</div>
-+    <div class="nonhighlight">
-+    <%
-+       if pygments_html_formatter:
-+           pygments_html_formatter.linenostart = lineno
-+    %>
-+      <div class="sourceline">${line | syntax_highlight(filename)}</div>
-+    </div>
-+% endfor
-+</div>
-+ -+% if full: -+ -+ -+% endif -+""", output_encoding=sys.getdefaultencoding(), -+ encoding_errors='htmlentityreplace') -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py -new file mode 100644 -index 0000000..d79ce23 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py -@@ -0,0 +1,201 @@ -+# mako/filters.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+ -+import re -+import codecs -+ -+from mako.compat import quote_plus, unquote_plus, codepoint2name, \ -+ name2codepoint -+ -+from mako import compat -+ -+xml_escapes = { -+ '&': '&', -+ '>': '>', -+ '<': '<', -+ '"': '"', # also " in html-only -+ "'": ''' # also ' in html-only -+} -+ -+# XXX: " is valid in HTML and XML -+# ' is not valid HTML, but is valid XML -+ -+def legacy_html_escape(s): -+ """legacy HTML escape for non-unicode mode.""" -+ s = s.replace("&", "&") -+ s = s.replace(">", ">") -+ s = s.replace("<", "<") -+ s = s.replace('"', """) -+ s = s.replace("'", "'") -+ return s -+ -+ -+try: -+ import markupsafe -+ html_escape = markupsafe.escape -+except ImportError: -+ html_escape = legacy_html_escape -+ -+def xml_escape(string): -+ return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string) -+ -+def url_escape(string): -+ # convert into a list of octets -+ string = string.encode("utf8") -+ return quote_plus(string) -+ -+def legacy_url_escape(string): -+ # convert into a list of octets -+ return quote_plus(string) -+ -+def url_unescape(string): -+ text = unquote_plus(string) -+ if not is_ascii_str(text): -+ text = text.decode("utf8") -+ return text -+ -+def trim(string): -+ return string.strip() -+ -+ -+class Decode(object): -+ def __getattr__(self, key): -+ def decode(x): -+ if isinstance(x, compat.text_type): -+ return x -+ elif not isinstance(x, compat.binary_type): -+ return decode(str(x)) -+ else: -+ return compat.text_type(x, encoding=key) -+ return decode -+decode = Decode() -+ -+ -+_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z') -+ -+def is_ascii_str(text): -+ return isinstance(text, str) and _ASCII_re.match(text) -+ -+################################################################ -+ -+class XMLEntityEscaper(object): -+ def __init__(self, codepoint2name, name2codepoint): -+ self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n)) -+ for c, n in codepoint2name.items()]) -+ self.name2codepoint = name2codepoint -+ -+ def escape_entities(self, text): -+ """Replace characters with their character entity references. -+ -+ Only characters corresponding to a named entity are replaced. -+ """ -+ return compat.text_type(text).translate(self.codepoint2entity) -+ -+ def __escape(self, m): -+ codepoint = ord(m.group()) -+ try: -+ return self.codepoint2entity[codepoint] -+ except (KeyError, IndexError): -+ return '&#x%X;' % codepoint -+ -+ -+ __escapable = re.compile(r'["&<>]|[^\x00-\x7f]') -+ -+ def escape(self, text): -+ """Replace characters with their character references. -+ -+ Replace characters by their named entity references. -+ Non-ASCII characters, if they do not have a named entity reference, -+ are replaced by numerical character references. -+ -+ The return value is guaranteed to be ASCII. 
-+ """ -+ return self.__escapable.sub(self.__escape, compat.text_type(text) -+ ).encode('ascii') -+ -+ # XXX: This regexp will not match all valid XML entity names__. -+ # (It punts on details involving involving CombiningChars and Extenders.) -+ # -+ # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef -+ __characterrefs = re.compile(r'''& (?: -+ \#(\d+) -+ | \#x([\da-f]+) -+ | ( (?!\d) [:\w] [-.:\w]+ ) -+ ) ;''', -+ re.X | re.UNICODE) -+ -+ def __unescape(self, m): -+ dval, hval, name = m.groups() -+ if dval: -+ codepoint = int(dval) -+ elif hval: -+ codepoint = int(hval, 16) -+ else: -+ codepoint = self.name2codepoint.get(name, 0xfffd) -+ # U+FFFD = "REPLACEMENT CHARACTER" -+ if codepoint < 128: -+ return chr(codepoint) -+ return chr(codepoint) -+ -+ def unescape(self, text): -+ """Unescape character references. -+ -+ All character references (both entity references and numerical -+ character references) are unescaped. -+ """ -+ return self.__characterrefs.sub(self.__unescape, text) -+ -+ -+_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint) -+ -+html_entities_escape = _html_entities_escaper.escape_entities -+html_entities_unescape = _html_entities_escaper.unescape -+ -+ -+def htmlentityreplace_errors(ex): -+ """An encoding error handler. -+ -+ This python `codecs`_ error handler replaces unencodable -+ characters with HTML entities, or, if no HTML entity exists for -+ the character, XML character references. -+ -+ >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace') -+ 'The cost was €12.' -+ """ -+ if isinstance(ex, UnicodeEncodeError): -+ # Handle encoding errors -+ bad_text = ex.object[ex.start:ex.end] -+ text = _html_entities_escaper.escape(bad_text) -+ return (compat.text_type(text), ex.end) -+ raise ex -+ -+codecs.register_error('htmlentityreplace', htmlentityreplace_errors) -+ -+ -+# TODO: options to make this dynamic per-compilation will be added in a later -+# release -+DEFAULT_ESCAPES = { -+ 'x': 'filters.xml_escape', -+ 'h': 'filters.html_escape', -+ 'u': 'filters.url_escape', -+ 'trim': 'filters.trim', -+ 'entity': 'filters.html_entities_escape', -+ 'unicode': 'unicode', -+ 'decode': 'decode', -+ 'str': 'str', -+ 'n': 'n' -+} -+ -+if compat.py3k: -+ DEFAULT_ESCAPES.update({ -+ 'unicode': 'str' -+ }) -+ -+NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy() -+NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape' -+NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape' -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py -new file mode 100644 -index 0000000..1dda398 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py -@@ -0,0 +1,441 @@ -+# mako/lexer.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""provides the Lexer class for parsing template strings into parse trees.""" -+ -+import re -+import codecs -+from mako import parsetree, exceptions, compat -+from mako.pygen import adjust_whitespace -+ -+_regexp_cache = {} -+ -+class Lexer(object): -+ def __init__(self, text, filename=None, -+ disable_unicode=False, -+ input_encoding=None, preprocessor=None): -+ self.text = text -+ self.filename = filename -+ self.template = parsetree.TemplateNode(self.filename) -+ self.matched_lineno = 1 -+ self.matched_charpos = 0 -+ self.lineno = 1 -+ self.match_position = 0 -+ self.tag = [] -+ 
self.control_line = [] -+ self.ternary_stack = [] -+ self.disable_unicode = disable_unicode -+ self.encoding = input_encoding -+ -+ if compat.py3k and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "Mako for Python 3 does not " -+ "support disabling Unicode") -+ -+ if preprocessor is None: -+ self.preprocessor = [] -+ elif not hasattr(preprocessor, '__iter__'): -+ self.preprocessor = [preprocessor] -+ else: -+ self.preprocessor = preprocessor -+ -+ @property -+ def exception_kwargs(self): -+ return {'source': self.text, -+ 'lineno': self.matched_lineno, -+ 'pos': self.matched_charpos, -+ 'filename': self.filename} -+ -+ def match(self, regexp, flags=None): -+ """compile the given regexp, cache the reg, and call match_reg().""" -+ -+ try: -+ reg = _regexp_cache[(regexp, flags)] -+ except KeyError: -+ if flags: -+ reg = re.compile(regexp, flags) -+ else: -+ reg = re.compile(regexp) -+ _regexp_cache[(regexp, flags)] = reg -+ -+ return self.match_reg(reg) -+ -+ def match_reg(self, reg): -+ """match the given regular expression object to the current text -+ position. -+ -+ if a match occurs, update the current text and line position. -+ -+ """ -+ -+ mp = self.match_position -+ -+ match = reg.match(self.text, self.match_position) -+ if match: -+ (start, end) = match.span() -+ if end == start: -+ self.match_position = end + 1 -+ else: -+ self.match_position = end -+ self.matched_lineno = self.lineno -+ lines = re.findall(r"\n", self.text[mp:self.match_position]) -+ cp = mp - 1 -+ while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'): -+ cp -= 1 -+ self.matched_charpos = mp - cp -+ self.lineno += len(lines) -+ #print "MATCHED:", match.group(0), "LINE START:", -+ # self.matched_lineno, "LINE END:", self.lineno -+ #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \ -+ # (match and "TRUE" or "FALSE") -+ return match -+ -+ def parse_until_text(self, *text): -+ startpos = self.match_position -+ text_re = r'|'.join(text) -+ brace_level = 0 -+ while True: -+ match = self.match(r'#.*\n') -+ if match: -+ continue -+ match = self.match(r'(\"\"\"|\'\'\'|\"|\')((? 
0: -+ brace_level -= 1 -+ continue -+ return \ -+ self.text[startpos: -+ self.match_position - len(match.group(1))],\ -+ match.group(1) -+ match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S) -+ if match: -+ brace_level += match.group(1).count('{') -+ brace_level -= match.group(1).count('}') -+ continue -+ raise exceptions.SyntaxException( -+ "Expected: %s" % -+ ','.join(text), -+ **self.exception_kwargs) -+ -+ def append_node(self, nodecls, *args, **kwargs): -+ kwargs.setdefault('source', self.text) -+ kwargs.setdefault('lineno', self.matched_lineno) -+ kwargs.setdefault('pos', self.matched_charpos) -+ kwargs['filename'] = self.filename -+ node = nodecls(*args, **kwargs) -+ if len(self.tag): -+ self.tag[-1].nodes.append(node) -+ else: -+ self.template.nodes.append(node) -+ # build a set of child nodes for the control line -+ # (used for loop variable detection) -+ # also build a set of child nodes on ternary control lines -+ # (used for determining if a pass needs to be auto-inserted -+ if self.control_line: -+ control_frame = self.control_line[-1] -+ control_frame.nodes.append(node) -+ if not (isinstance(node, parsetree.ControlLine) and -+ control_frame.is_ternary(node.keyword)): -+ if self.ternary_stack and self.ternary_stack[-1]: -+ self.ternary_stack[-1][-1].nodes.append(node) -+ if isinstance(node, parsetree.Tag): -+ if len(self.tag): -+ node.parent = self.tag[-1] -+ self.tag.append(node) -+ elif isinstance(node, parsetree.ControlLine): -+ if node.isend: -+ self.control_line.pop() -+ self.ternary_stack.pop() -+ elif node.is_primary: -+ self.control_line.append(node) -+ self.ternary_stack.append([]) -+ elif self.control_line and \ -+ self.control_line[-1].is_ternary(node.keyword): -+ self.ternary_stack[-1].append(node) -+ elif self.control_line and \ -+ not self.control_line[-1].is_ternary(node.keyword): -+ raise exceptions.SyntaxException( -+ "Keyword '%s' not a legal ternary for keyword '%s'" % -+ (node.keyword, self.control_line[-1].keyword), -+ **self.exception_kwargs) -+ -+ _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n') -+ -+ def decode_raw_stream(self, text, decode_raw, known_encoding, filename): -+ """given string/unicode or bytes/string, determine encoding -+ from magic encoding comment, return body as unicode -+ or raw if decode_raw=False -+ -+ """ -+ if isinstance(text, compat.text_type): -+ m = self._coding_re.match(text) -+ encoding = m and m.group(1) or known_encoding or 'ascii' -+ return encoding, text -+ -+ if text.startswith(codecs.BOM_UTF8): -+ text = text[len(codecs.BOM_UTF8):] -+ parsed_encoding = 'utf-8' -+ m = self._coding_re.match(text.decode('utf-8', 'ignore')) -+ if m is not None and m.group(1) != 'utf-8': -+ raise exceptions.CompileException( -+ "Found utf-8 BOM in file, with conflicting " -+ "magic encoding comment of '%s'" % m.group(1), -+ text.decode('utf-8', 'ignore'), -+ 0, 0, filename) -+ else: -+ m = self._coding_re.match(text.decode('utf-8', 'ignore')) -+ if m: -+ parsed_encoding = m.group(1) -+ else: -+ parsed_encoding = known_encoding or 'ascii' -+ -+ if decode_raw: -+ try: -+ text = text.decode(parsed_encoding) -+ except UnicodeDecodeError: -+ raise exceptions.CompileException( -+ "Unicode decode operation of encoding '%s' failed" % -+ parsed_encoding, -+ text.decode('utf-8', 'ignore'), -+ 0, 0, filename) -+ -+ return parsed_encoding, text -+ -+ def parse(self): -+ self.encoding, self.text = self.decode_raw_stream(self.text, -+ not self.disable_unicode, -+ self.encoding, -+ self.filename,) -+ -+ for preproc in self.preprocessor: 
-+ self.text = preproc(self.text) -+ -+ # push the match marker past the -+ # encoding comment. -+ self.match_reg(self._coding_re) -+ -+ self.textlength = len(self.text) -+ -+ while (True): -+ if self.match_position > self.textlength: -+ break -+ -+ if self.match_end(): -+ break -+ if self.match_expression(): -+ continue -+ if self.match_control_line(): -+ continue -+ if self.match_comment(): -+ continue -+ if self.match_tag_start(): -+ continue -+ if self.match_tag_end(): -+ continue -+ if self.match_python_block(): -+ continue -+ if self.match_text(): -+ continue -+ -+ if self.match_position > self.textlength: -+ break -+ raise exceptions.CompileException("assertion failed") -+ -+ if len(self.tag): -+ raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % -+ self.tag[-1].keyword, -+ **self.exception_kwargs) -+ if len(self.control_line): -+ raise exceptions.SyntaxException( -+ "Unterminated control keyword: '%s'" % -+ self.control_line[-1].keyword, -+ self.text, -+ self.control_line[-1].lineno, -+ self.control_line[-1].pos, self.filename) -+ return self.template -+ -+ def match_tag_start(self): -+ match = self.match(r''' -+ \<% # opening tag -+ -+ ([\w\.\:]+) # keyword -+ -+ ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \ -+ # sign, string expression -+ -+ \s* # more whitespace -+ -+ (/)?> # closing -+ -+ ''', -+ -+ re.I | re.S | re.X) -+ -+ if match: -+ keyword, attr, isend = match.groups() -+ self.keyword = keyword -+ attributes = {} -+ if attr: -+ for att in re.findall( -+ r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr): -+ key, val1, val2 = att -+ text = val1 or val2 -+ text = text.replace('\r\n', '\n') -+ attributes[key] = text -+ self.append_node(parsetree.Tag, keyword, attributes) -+ if isend: -+ self.tag.pop() -+ else: -+ if keyword == 'text': -+ match = self.match(r'(.*?)(?=\)', re.S) -+ if not match: -+ raise exceptions.SyntaxException( -+ "Unclosed tag: <%%%s>" % -+ self.tag[-1].keyword, -+ **self.exception_kwargs) -+ self.append_node(parsetree.Text, match.group(1)) -+ return self.match_tag_end() -+ return True -+ else: -+ return False -+ -+ def match_tag_end(self): -+ match = self.match(r'\') -+ if match: -+ if not len(self.tag): -+ raise exceptions.SyntaxException( -+ "Closing tag without opening tag: " % -+ match.group(1), -+ **self.exception_kwargs) -+ elif self.tag[-1].keyword != match.group(1): -+ raise exceptions.SyntaxException( -+ "Closing tag does not match tag: <%%%s>" % -+ (match.group(1), self.tag[-1].keyword), -+ **self.exception_kwargs) -+ self.tag.pop() -+ return True -+ else: -+ return False -+ -+ def match_end(self): -+ match = self.match(r'\Z', re.S) -+ if match: -+ string = match.group() -+ if string: -+ return string -+ else: -+ return True -+ else: -+ return False -+ -+ def match_text(self): -+ match = self.match(r""" -+ (.*?) 
# anything, followed by: -+ ( -+ (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based -+ # comment preceded by a -+ # consumed newline and whitespace -+ | -+ (?=\${) # an expression -+ | -+ (?=') -+ # the trailing newline helps -+ # compiler.parse() not complain about indentation -+ text = adjust_whitespace(text) + "\n" -+ self.append_node( -+ parsetree.Code, -+ text, -+ match.group(1) == '!', lineno=line, pos=pos) -+ return True -+ else: -+ return False -+ -+ def match_expression(self): -+ match = self.match(r"\${") -+ if match: -+ line, pos = self.matched_lineno, self.matched_charpos -+ text, end = self.parse_until_text(r'\|', r'}') -+ if end == '|': -+ escapes, end = self.parse_until_text(r'}') -+ else: -+ escapes = "" -+ text = text.replace('\r\n', '\n') -+ self.append_node( -+ parsetree.Expression, -+ text, escapes.strip(), -+ lineno=line, pos=pos) -+ return True -+ else: -+ return False -+ -+ def match_control_line(self): -+ match = self.match( -+ r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)" -+ r"(?:\r?\n|\Z)", re.M) -+ if match: -+ operator = match.group(1) -+ text = match.group(2) -+ if operator == '%': -+ m2 = re.match(r'(end)?(\w+)\s*(.*)', text) -+ if not m2: -+ raise exceptions.SyntaxException( -+ "Invalid control line: '%s'" % -+ text, -+ **self.exception_kwargs) -+ isend, keyword = m2.group(1, 2) -+ isend = (isend is not None) -+ -+ if isend: -+ if not len(self.control_line): -+ raise exceptions.SyntaxException( -+ "No starting keyword '%s' for '%s'" % -+ (keyword, text), -+ **self.exception_kwargs) -+ elif self.control_line[-1].keyword != keyword: -+ raise exceptions.SyntaxException( -+ "Keyword '%s' doesn't match keyword '%s'" % -+ (text, self.control_line[-1].keyword), -+ **self.exception_kwargs) -+ self.append_node(parsetree.ControlLine, keyword, isend, text) -+ else: -+ self.append_node(parsetree.Comment, text) -+ return True -+ else: -+ return False -+ -+ def match_comment(self): -+ """matches the multiline version of a comment""" -+ match = self.match(r"<%doc>(.*?)", re.S) -+ if match: -+ self.append_node(parsetree.Comment, match.group(1)) -+ return True -+ else: -+ return False -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py -new file mode 100644 -index 0000000..2af5411 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py -@@ -0,0 +1,359 @@ -+# mako/lookup.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+import os, stat, posixpath, re -+from mako import exceptions, util -+from mako.template import Template -+ -+try: -+ import threading -+except: -+ import dummy_threading as threading -+ -+class TemplateCollection(object): -+ """Represent a collection of :class:`.Template` objects, -+ identifiable via URI. -+ -+ A :class:`.TemplateCollection` is linked to the usage of -+ all template tags that address other templates, such -+ as ``<%include>``, ``<%namespace>``, and ``<%inherit>``. -+ The ``file`` attribute of each of those tags refers -+ to a string URI that is passed to that :class:`.Template` -+ object's :class:`.TemplateCollection` for resolution. -+ -+ :class:`.TemplateCollection` is an abstract class, -+ with the usual default implementation being :class:`.TemplateLookup`. 
-+ -+ """ -+ -+ def has_template(self, uri): -+ """Return ``True`` if this :class:`.TemplateLookup` is -+ capable of returning a :class:`.Template` object for the -+ given ``uri``. -+ -+ :param uri: String URI of the template to be resolved. -+ -+ """ -+ try: -+ self.get_template(uri) -+ return True -+ except exceptions.TemplateLookupException: -+ return False -+ -+ def get_template(self, uri, relativeto=None): -+ """Return a :class:`.Template` object corresponding to the given -+ ``uri``. -+ -+ The default implementation raises -+ :class:`.NotImplementedError`. Implementations should -+ raise :class:`.TemplateLookupException` if the given ``uri`` -+ cannot be resolved. -+ -+ :param uri: String URI of the template to be resolved. -+ :param relativeto: if present, the given ``uri`` is assumed to -+ be relative to this URI. -+ -+ """ -+ raise NotImplementedError() -+ -+ def filename_to_uri(self, uri, filename): -+ """Convert the given ``filename`` to a URI relative to -+ this :class:`.TemplateCollection`.""" -+ -+ return uri -+ -+ def adjust_uri(self, uri, filename): -+ """Adjust the given ``uri`` based on the calling ``filename``. -+ -+ When this method is called from the runtime, the -+ ``filename`` parameter is taken directly to the ``filename`` -+ attribute of the calling template. Therefore a custom -+ :class:`.TemplateCollection` subclass can place any string -+ identifier desired in the ``filename`` parameter of the -+ :class:`.Template` objects it constructs and have them come back -+ here. -+ -+ """ -+ return uri -+ -+class TemplateLookup(TemplateCollection): -+ """Represent a collection of templates that locates template source files -+ from the local filesystem. -+ -+ The primary argument is the ``directories`` argument, the list of -+ directories to search: -+ -+ .. sourcecode:: python -+ -+ lookup = TemplateLookup(["/path/to/templates"]) -+ some_template = lookup.get_template("/index.html") -+ -+ The :class:`.TemplateLookup` can also be given :class:`.Template` objects -+ programatically using :meth:`.put_string` or :meth:`.put_template`: -+ -+ .. sourcecode:: python -+ -+ lookup = TemplateLookup() -+ lookup.put_string("base.html", ''' -+ ${self.next()} -+ ''') -+ lookup.put_string("hello.html", ''' -+ <%include file='base.html'/> -+ -+ Hello, world ! -+ ''') -+ -+ -+ :param directories: A list of directory names which will be -+ searched for a particular template URI. The URI is appended -+ to each directory and the filesystem checked. -+ -+ :param collection_size: Approximate size of the collection used -+ to store templates. If left at its default of ``-1``, the size -+ is unbounded, and a plain Python dictionary is used to -+ relate URI strings to :class:`.Template` instances. -+ Otherwise, a least-recently-used cache object is used which -+ will maintain the size of the collection approximately to -+ the number given. -+ -+ :param filesystem_checks: When at its default value of ``True``, -+ each call to :meth:`.TemplateLookup.get_template()` will -+ compare the filesystem last modified time to the time in -+ which an existing :class:`.Template` object was created. -+ This allows the :class:`.TemplateLookup` to regenerate a -+ new :class:`.Template` whenever the original source has -+ been updated. Set this to ``False`` for a very minor -+ performance increase. -+ -+ :param modulename_callable: A callable which, when present, -+ is passed the path of the source file as well as the -+ requested URI, and then returns the full path of the -+ generated Python module file. 
This is used to inject -+ alternate schemes for Python module location. If left at -+ its default of ``None``, the built in system of generation -+ based on ``module_directory`` plus ``uri`` is used. -+ -+ All other keyword parameters available for -+ :class:`.Template` are mirrored here. When new -+ :class:`.Template` objects are created, the keywords -+ established with this :class:`.TemplateLookup` are passed on -+ to each new :class:`.Template`. -+ -+ """ -+ -+ def __init__(self, -+ directories=None, -+ module_directory=None, -+ filesystem_checks=True, -+ collection_size=-1, -+ format_exceptions=False, -+ error_handler=None, -+ disable_unicode=False, -+ bytestring_passthrough=False, -+ output_encoding=None, -+ encoding_errors='strict', -+ -+ cache_args=None, -+ cache_impl='beaker', -+ cache_enabled=True, -+ cache_type=None, -+ cache_dir=None, -+ cache_url=None, -+ -+ modulename_callable=None, -+ module_writer=None, -+ default_filters=None, -+ buffer_filters=(), -+ strict_undefined=False, -+ imports=None, -+ future_imports=None, -+ enable_loop=True, -+ input_encoding=None, -+ preprocessor=None, -+ lexer_cls=None): -+ -+ self.directories = [posixpath.normpath(d) for d in -+ util.to_list(directories, ()) -+ ] -+ self.module_directory = module_directory -+ self.modulename_callable = modulename_callable -+ self.filesystem_checks = filesystem_checks -+ self.collection_size = collection_size -+ -+ if cache_args is None: -+ cache_args = {} -+ # transfer deprecated cache_* args -+ if cache_dir: -+ cache_args.setdefault('dir', cache_dir) -+ if cache_url: -+ cache_args.setdefault('url', cache_url) -+ if cache_type: -+ cache_args.setdefault('type', cache_type) -+ -+ self.template_args = { -+ 'format_exceptions':format_exceptions, -+ 'error_handler':error_handler, -+ 'disable_unicode':disable_unicode, -+ 'bytestring_passthrough':bytestring_passthrough, -+ 'output_encoding':output_encoding, -+ 'cache_impl':cache_impl, -+ 'encoding_errors':encoding_errors, -+ 'input_encoding':input_encoding, -+ 'module_directory':module_directory, -+ 'module_writer':module_writer, -+ 'cache_args':cache_args, -+ 'cache_enabled':cache_enabled, -+ 'default_filters':default_filters, -+ 'buffer_filters':buffer_filters, -+ 'strict_undefined':strict_undefined, -+ 'imports':imports, -+ 'future_imports':future_imports, -+ 'enable_loop':enable_loop, -+ 'preprocessor':preprocessor, -+ 'lexer_cls':lexer_cls -+ } -+ -+ if collection_size == -1: -+ self._collection = {} -+ self._uri_cache = {} -+ else: -+ self._collection = util.LRUCache(collection_size) -+ self._uri_cache = util.LRUCache(collection_size) -+ self._mutex = threading.Lock() -+ -+ def get_template(self, uri): -+ """Return a :class:`.Template` object corresponding to the given -+ ``uri``. -+ -+ .. note:: The ``relativeto`` argument is not supported here at the moment. 
-+ -+ """ -+ -+ try: -+ if self.filesystem_checks: -+ return self._check(uri, self._collection[uri]) -+ else: -+ return self._collection[uri] -+ except KeyError: -+ u = re.sub(r'^\/+', '', uri) -+ for dir in self.directories: -+ srcfile = posixpath.normpath(posixpath.join(dir, u)) -+ if os.path.isfile(srcfile): -+ return self._load(srcfile, uri) -+ else: -+ raise exceptions.TopLevelLookupException( -+ "Cant locate template for uri %r" % uri) -+ -+ def adjust_uri(self, uri, relativeto): -+ """Adjust the given ``uri`` based on the given relative URI.""" -+ -+ key = (uri, relativeto) -+ if key in self._uri_cache: -+ return self._uri_cache[key] -+ -+ if uri[0] != '/': -+ if relativeto is not None: -+ v = self._uri_cache[key] = posixpath.join( -+ posixpath.dirname(relativeto), uri) -+ else: -+ v = self._uri_cache[key] = '/' + uri -+ else: -+ v = self._uri_cache[key] = uri -+ return v -+ -+ -+ def filename_to_uri(self, filename): -+ """Convert the given ``filename`` to a URI relative to -+ this :class:`.TemplateCollection`.""" -+ -+ try: -+ return self._uri_cache[filename] -+ except KeyError: -+ value = self._relativeize(filename) -+ self._uri_cache[filename] = value -+ return value -+ -+ def _relativeize(self, filename): -+ """Return the portion of a filename that is 'relative' -+ to the directories in this lookup. -+ -+ """ -+ -+ filename = posixpath.normpath(filename) -+ for dir in self.directories: -+ if filename[0:len(dir)] == dir: -+ return filename[len(dir):] -+ else: -+ return None -+ -+ def _load(self, filename, uri): -+ self._mutex.acquire() -+ try: -+ try: -+ # try returning from collection one -+ # more time in case concurrent thread already loaded -+ return self._collection[uri] -+ except KeyError: -+ pass -+ try: -+ if self.modulename_callable is not None: -+ module_filename = self.modulename_callable(filename, uri) -+ else: -+ module_filename = None -+ self._collection[uri] = template = Template( -+ uri=uri, -+ filename=posixpath.normpath(filename), -+ lookup=self, -+ module_filename=module_filename, -+ **self.template_args) -+ return template -+ except: -+ # if compilation fails etc, ensure -+ # template is removed from collection, -+ # re-raise -+ self._collection.pop(uri, None) -+ raise -+ finally: -+ self._mutex.release() -+ -+ def _check(self, uri, template): -+ if template.filename is None: -+ return template -+ -+ try: -+ template_stat = os.stat(template.filename) -+ if template.module._modified_time < \ -+ template_stat[stat.ST_MTIME]: -+ self._collection.pop(uri, None) -+ return self._load(template.filename, uri) -+ else: -+ return template -+ except OSError: -+ self._collection.pop(uri, None) -+ raise exceptions.TemplateLookupException( -+ "Cant locate template for uri %r" % uri) -+ -+ -+ def put_string(self, uri, text): -+ """Place a new :class:`.Template` object into this -+ :class:`.TemplateLookup`, based on the given string of -+ ``text``. -+ -+ """ -+ self._collection[uri] = Template( -+ text, -+ lookup=self, -+ uri=uri, -+ **self.template_args) -+ -+ def put_template(self, uri, template): -+ """Place a new :class:`.Template` object into this -+ :class:`.TemplateLookup`, based on the given -+ :class:`.Template` object. 
-+ -+ """ -+ self._collection[uri] = template -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py -new file mode 100644 -index 0000000..49ec4e0 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py -@@ -0,0 +1,594 @@ -+# mako/parsetree.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""defines the parse tree components for Mako templates.""" -+ -+from mako import exceptions, ast, util, filters, compat -+import re -+ -+class Node(object): -+ """base class for a Node in the parse tree.""" -+ -+ def __init__(self, source, lineno, pos, filename): -+ self.source = source -+ self.lineno = lineno -+ self.pos = pos -+ self.filename = filename -+ -+ @property -+ def exception_kwargs(self): -+ return {'source': self.source, 'lineno': self.lineno, -+ 'pos': self.pos, 'filename': self.filename} -+ -+ def get_children(self): -+ return [] -+ -+ def accept_visitor(self, visitor): -+ def traverse(node): -+ for n in node.get_children(): -+ n.accept_visitor(visitor) -+ -+ method = getattr(visitor, "visit" + self.__class__.__name__, traverse) -+ method(self) -+ -+class TemplateNode(Node): -+ """a 'container' node that stores the overall collection of nodes.""" -+ -+ def __init__(self, filename): -+ super(TemplateNode, self).__init__('', 0, 0, filename) -+ self.nodes = [] -+ self.page_attributes = {} -+ -+ def get_children(self): -+ return self.nodes -+ -+ def __repr__(self): -+ return "TemplateNode(%s, %r)" % ( -+ util.sorted_dict_repr(self.page_attributes), -+ self.nodes) -+ -+class ControlLine(Node): -+ """defines a control line, a line-oriented python line or end tag. -+ -+ e.g.:: -+ -+ % if foo: -+ (markup) -+ % endif -+ -+ """ -+ -+ has_loop_context = False -+ -+ def __init__(self, keyword, isend, text, **kwargs): -+ super(ControlLine, self).__init__(**kwargs) -+ self.text = text -+ self.keyword = keyword -+ self.isend = isend -+ self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with'] -+ self.nodes = [] -+ if self.isend: -+ self._declared_identifiers = [] -+ self._undeclared_identifiers = [] -+ else: -+ code = ast.PythonFragment(text, **self.exception_kwargs) -+ self._declared_identifiers = code.declared_identifiers -+ self._undeclared_identifiers = code.undeclared_identifiers -+ -+ def get_children(self): -+ return self.nodes -+ -+ def declared_identifiers(self): -+ return self._declared_identifiers -+ -+ def undeclared_identifiers(self): -+ return self._undeclared_identifiers -+ -+ def is_ternary(self, keyword): -+ """return true if the given keyword is a ternary keyword -+ for this ControlLine""" -+ -+ return keyword in { -+ 'if':set(['else', 'elif']), -+ 'try':set(['except', 'finally']), -+ 'for':set(['else']) -+ }.get(self.keyword, []) -+ -+ def __repr__(self): -+ return "ControlLine(%r, %r, %r, %r)" % ( -+ self.keyword, -+ self.text, -+ self.isend, -+ (self.lineno, self.pos) -+ ) -+ -+class Text(Node): -+ """defines plain text in the template.""" -+ -+ def __init__(self, content, **kwargs): -+ super(Text, self).__init__(**kwargs) -+ self.content = content -+ -+ def __repr__(self): -+ return "Text(%r, %r)" % (self.content, (self.lineno, self.pos)) -+ -+class Code(Node): -+ """defines a Python code block, either inline or module level. -+ -+ e.g.:: -+ -+ inline: -+ <% -+ x = 12 -+ %> -+ -+ module level: -+ <%! 
-+ import logger -+ %> -+ -+ """ -+ -+ def __init__(self, text, ismodule, **kwargs): -+ super(Code, self).__init__(**kwargs) -+ self.text = text -+ self.ismodule = ismodule -+ self.code = ast.PythonCode(text, **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.code.declared_identifiers -+ -+ def undeclared_identifiers(self): -+ return self.code.undeclared_identifiers -+ -+ def __repr__(self): -+ return "Code(%r, %r, %r)" % ( -+ self.text, -+ self.ismodule, -+ (self.lineno, self.pos) -+ ) -+ -+class Comment(Node): -+ """defines a comment line. -+ -+ # this is a comment -+ -+ """ -+ -+ def __init__(self, text, **kwargs): -+ super(Comment, self).__init__(**kwargs) -+ self.text = text -+ -+ def __repr__(self): -+ return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos)) -+ -+class Expression(Node): -+ """defines an inline expression. -+ -+ ${x+y} -+ -+ """ -+ -+ def __init__(self, text, escapes, **kwargs): -+ super(Expression, self).__init__(**kwargs) -+ self.text = text -+ self.escapes = escapes -+ self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs) -+ self.code = ast.PythonCode(text, **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return [] -+ -+ def undeclared_identifiers(self): -+ # TODO: make the "filter" shortcut list configurable at parse/gen time -+ return self.code.undeclared_identifiers.union( -+ self.escapes_code.undeclared_identifiers.difference( -+ set(filters.DEFAULT_ESCAPES.keys()) -+ ) -+ ).difference(self.code.declared_identifiers) -+ -+ def __repr__(self): -+ return "Expression(%r, %r, %r)" % ( -+ self.text, -+ self.escapes_code.args, -+ (self.lineno, self.pos) -+ ) -+ -+class _TagMeta(type): -+ """metaclass to allow Tag to produce a subclass according to -+ its keyword""" -+ -+ _classmap = {} -+ -+ def __init__(cls, clsname, bases, dict): -+ if getattr(cls, '__keyword__', None) is not None: -+ cls._classmap[cls.__keyword__] = cls -+ super(_TagMeta, cls).__init__(clsname, bases, dict) -+ -+ def __call__(cls, keyword, attributes, **kwargs): -+ if ":" in keyword: -+ ns, defname = keyword.split(':') -+ return type.__call__(CallNamespaceTag, ns, defname, -+ attributes, **kwargs) -+ -+ try: -+ cls = _TagMeta._classmap[keyword] -+ except KeyError: -+ raise exceptions.CompileException( -+ "No such tag: '%s'" % keyword, -+ source=kwargs['source'], -+ lineno=kwargs['lineno'], -+ pos=kwargs['pos'], -+ filename=kwargs['filename'] -+ ) -+ return type.__call__(cls, keyword, attributes, **kwargs) -+ -+class Tag(compat.with_metaclass(_TagMeta, Node)): -+ """abstract base class for tags. -+ -+ <%sometag/> -+ -+ <%someothertag> -+ stuff -+ -+ -+ """ -+ __keyword__ = None -+ -+ def __init__(self, keyword, attributes, expressions, -+ nonexpressions, required, **kwargs): -+ """construct a new Tag instance. -+ -+ this constructor not called directly, and is only called -+ by subclasses. 
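The node classes above are produced by the lexer; a small sketch of building a parse tree and walking it with ``accept_visitor()`` (the visitor class and its method bodies are illustrative only):

    from mako.lexer import Lexer

    tree = Lexer("hello ${name}\n% if name:\nhi\n% endif\n").parse()

    class Dump(object):
        # accept_visitor() dispatches on "visit" + the node class name
        def visitExpression(self, node):
            print("Expression:", node.text, node.undeclared_identifiers())
        def visitControlLine(self, node):
            print("ControlLine:", node.keyword, "isend=%r" % node.isend)

    tree.accept_visitor(Dump())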
-+ -+ :param keyword: the tag keyword -+ -+ :param attributes: raw dictionary of attribute key/value pairs -+ -+ :param expressions: a set of identifiers that are legal attributes, -+ which can also contain embedded expressions -+ -+ :param nonexpressions: a set of identifiers that are legal -+ attributes, which cannot contain embedded expressions -+ -+ :param \**kwargs: -+ other arguments passed to the Node superclass (lineno, pos) -+ -+ """ -+ super(Tag, self).__init__(**kwargs) -+ self.keyword = keyword -+ self.attributes = attributes -+ self._parse_attributes(expressions, nonexpressions) -+ missing = [r for r in required if r not in self.parsed_attributes] -+ if len(missing): -+ raise exceptions.CompileException( -+ "Missing attribute(s): %s" % -+ ",".join([repr(m) for m in missing]), -+ **self.exception_kwargs) -+ self.parent = None -+ self.nodes = [] -+ -+ def is_root(self): -+ return self.parent is None -+ -+ def get_children(self): -+ return self.nodes -+ -+ def _parse_attributes(self, expressions, nonexpressions): -+ undeclared_identifiers = set() -+ self.parsed_attributes = {} -+ for key in self.attributes: -+ if key in expressions: -+ expr = [] -+ for x in re.compile(r'(\${.+?})', -+ re.S).split(self.attributes[key]): -+ m = re.compile(r'^\${(.+?)}$', re.S).match(x) -+ if m: -+ code = ast.PythonCode(m.group(1).rstrip(), -+ **self.exception_kwargs) -+ # we aren't discarding "declared_identifiers" here, -+ # which we do so that list comprehension-declared -+ # variables aren't counted. As yet can't find a -+ # condition that requires it here. -+ undeclared_identifiers = \ -+ undeclared_identifiers.union( -+ code.undeclared_identifiers) -+ expr.append('(%s)' % m.group(1)) -+ else: -+ if x: -+ expr.append(repr(x)) -+ self.parsed_attributes[key] = " + ".join(expr) or repr('') -+ elif key in nonexpressions: -+ if re.search(r'\${.+?}', self.attributes[key]): -+ raise exceptions.CompileException( -+ "Attibute '%s' in tag '%s' does not allow embedded " -+ "expressions" % (key, self.keyword), -+ **self.exception_kwargs) -+ self.parsed_attributes[key] = repr(self.attributes[key]) -+ else: -+ raise exceptions.CompileException( -+ "Invalid attribute for tag '%s': '%s'" % -+ (self.keyword, key), -+ **self.exception_kwargs) -+ self.expression_undeclared_identifiers = undeclared_identifiers -+ -+ def declared_identifiers(self): -+ return [] -+ -+ def undeclared_identifiers(self): -+ return self.expression_undeclared_identifiers -+ -+ def __repr__(self): -+ return "%s(%r, %s, %r, %r)" % (self.__class__.__name__, -+ self.keyword, -+ util.sorted_dict_repr(self.attributes), -+ (self.lineno, self.pos), -+ self.nodes -+ ) -+ -+class IncludeTag(Tag): -+ __keyword__ = 'include' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(IncludeTag, self).__init__( -+ keyword, -+ attributes, -+ ('file', 'import', 'args'), -+ (), ('file',), **kwargs) -+ self.page_args = ast.PythonCode( -+ "__DUMMY(%s)" % attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return [] -+ -+ def undeclared_identifiers(self): -+ identifiers = self.page_args.undeclared_identifiers.\ -+ difference(set(["__DUMMY"])).\ -+ difference(self.page_args.declared_identifiers) -+ return identifiers.union(super(IncludeTag, self). 
-+ undeclared_identifiers()) -+ -+class NamespaceTag(Tag): -+ __keyword__ = 'namespace' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(NamespaceTag, self).__init__( -+ keyword, attributes, -+ ('file',), -+ ('name','inheritable', -+ 'import','module'), -+ (), **kwargs) -+ -+ self.name = attributes.get('name', '__anon_%s' % hex(abs(id(self)))) -+ if not 'name' in attributes and not 'import' in attributes: -+ raise exceptions.CompileException( -+ "'name' and/or 'import' attributes are required " -+ "for <%namespace>", -+ **self.exception_kwargs) -+ if 'file' in attributes and 'module' in attributes: -+ raise exceptions.CompileException( -+ "<%namespace> may only have one of 'file' or 'module'", -+ **self.exception_kwargs -+ ) -+ -+ def declared_identifiers(self): -+ return [] -+ -+class TextTag(Tag): -+ __keyword__ = 'text' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(TextTag, self).__init__( -+ keyword, -+ attributes, (), -+ ('filter'), (), **kwargs) -+ self.filter_args = ast.ArgumentList( -+ attributes.get('filter', ''), -+ **self.exception_kwargs) -+ -+ def undeclared_identifiers(self): -+ return self.filter_args.\ -+ undeclared_identifiers.\ -+ difference(filters.DEFAULT_ESCAPES.keys()).union( -+ self.expression_undeclared_identifiers -+ ) -+ -+class DefTag(Tag): -+ __keyword__ = 'def' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ expressions = ['buffered', 'cached'] + [ -+ c for c in attributes if c.startswith('cache_')] -+ -+ -+ super(DefTag, self).__init__( -+ keyword, -+ attributes, -+ expressions, -+ ('name', 'filter', 'decorator'), -+ ('name',), -+ **kwargs) -+ name = attributes['name'] -+ if re.match(r'^[\w_]+$', name): -+ raise exceptions.CompileException( -+ "Missing parenthesis in %def", -+ **self.exception_kwargs) -+ self.function_decl = ast.FunctionDecl("def " + name + ":pass", -+ **self.exception_kwargs) -+ self.name = self.function_decl.funcname -+ self.decorator = attributes.get('decorator', '') -+ self.filter_args = ast.ArgumentList( -+ attributes.get('filter', ''), -+ **self.exception_kwargs) -+ -+ is_anonymous = False -+ is_block = False -+ -+ @property -+ def funcname(self): -+ return self.function_decl.funcname -+ -+ def get_argument_expressions(self, **kw): -+ return self.function_decl.get_argument_expressions(**kw) -+ -+ def declared_identifiers(self): -+ return self.function_decl.allargnames -+ -+ def undeclared_identifiers(self): -+ res = [] -+ for c in self.function_decl.defaults: -+ res += list(ast.PythonCode(c, **self.exception_kwargs). 
-+ undeclared_identifiers) -+ return set(res).union( -+ self.filter_args.\ -+ undeclared_identifiers.\ -+ difference(filters.DEFAULT_ESCAPES.keys()) -+ ).union( -+ self.expression_undeclared_identifiers -+ ).difference( -+ self.function_decl.allargnames -+ ) -+ -+class BlockTag(Tag): -+ __keyword__ = 'block' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ expressions = ['buffered', 'cached', 'args'] + [ -+ c for c in attributes if c.startswith('cache_')] -+ -+ super(BlockTag, self).__init__( -+ keyword, -+ attributes, -+ expressions, -+ ('name','filter', 'decorator'), -+ (), -+ **kwargs) -+ name = attributes.get('name') -+ if name and not re.match(r'^[\w_]+$',name): -+ raise exceptions.CompileException( -+ "%block may not specify an argument signature", -+ **self.exception_kwargs) -+ if not name and attributes.get('args', None): -+ raise exceptions.CompileException( -+ "Only named %blocks may specify args", -+ **self.exception_kwargs -+ ) -+ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ self.name = name -+ self.decorator = attributes.get('decorator', '') -+ self.filter_args = ast.ArgumentList( -+ attributes.get('filter', ''), -+ **self.exception_kwargs) -+ -+ -+ is_block = True -+ -+ @property -+ def is_anonymous(self): -+ return self.name is None -+ -+ @property -+ def funcname(self): -+ return self.name or "__M_anon_%d" % (self.lineno, ) -+ -+ def get_argument_expressions(self, **kw): -+ return self.body_decl.get_argument_expressions(**kw) -+ -+ def declared_identifiers(self): -+ return self.body_decl.allargnames -+ -+ def undeclared_identifiers(self): -+ return (self.filter_args.\ -+ undeclared_identifiers.\ -+ difference(filters.DEFAULT_ESCAPES.keys()) -+ ).union(self.expression_undeclared_identifiers) -+ -+ -+ -+class CallTag(Tag): -+ __keyword__ = 'call' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(CallTag, self).__init__(keyword, attributes, -+ ('args'), ('expr',), ('expr',), **kwargs) -+ self.expression = attributes['expr'] -+ self.code = ast.PythonCode(self.expression, **self.exception_kwargs) -+ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.code.declared_identifiers.union(self.body_decl.allargnames) -+ -+ def undeclared_identifiers(self): -+ return self.code.undeclared_identifiers.\ -+ difference(self.code.declared_identifiers) -+ -+class CallNamespaceTag(Tag): -+ -+ def __init__(self, namespace, defname, attributes, **kwargs): -+ super(CallNamespaceTag, self).__init__( -+ namespace + ":" + defname, -+ attributes, -+ tuple(attributes.keys()) + ('args', ), -+ (), -+ (), -+ **kwargs) -+ -+ self.expression = "%s.%s(%s)" % ( -+ namespace, -+ defname, -+ ",".join(["%s=%s" % (k, v) for k, v in -+ self.parsed_attributes.items() -+ if k != 'args']) -+ ) -+ self.code = ast.PythonCode(self.expression, **self.exception_kwargs) -+ self.body_decl = ast.FunctionArgs( -+ attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.code.declared_identifiers.union(self.body_decl.allargnames) -+ -+ def undeclared_identifiers(self): -+ return self.code.undeclared_identifiers.\ -+ difference(self.code.declared_identifiers) -+ -+class InheritTag(Tag): -+ __keyword__ = 'inherit' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(InheritTag, self).__init__( -+ keyword, attributes, -+ ('file',), (), ('file',), **kwargs) -+ -+class PageTag(Tag): -+ __keyword__ = 
'page' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ expressions = ['cached', 'args', 'expression_filter', 'enable_loop'] + [ -+ c for c in attributes if c.startswith('cache_')] -+ -+ super(PageTag, self).__init__( -+ keyword, -+ attributes, -+ expressions, -+ (), -+ (), -+ **kwargs) -+ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), -+ **self.exception_kwargs) -+ self.filter_args = ast.ArgumentList( -+ attributes.get('expression_filter', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.body_decl.allargnames -+ -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py -new file mode 100644 -index 0000000..5ba5125 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py -@@ -0,0 +1,299 @@ -+# mako/pygen.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""utilities for generating and formatting literal Python code.""" -+ -+import re -+from mako import exceptions -+ -+class PythonPrinter(object): -+ def __init__(self, stream): -+ # indentation counter -+ self.indent = 0 -+ -+ # a stack storing information about why we incremented -+ # the indentation counter, to help us determine if we -+ # should decrement it -+ self.indent_detail = [] -+ -+ # the string of whitespace multiplied by the indent -+ # counter to produce a line -+ self.indentstring = " " -+ -+ # the stream we are writing to -+ self.stream = stream -+ -+ # current line number -+ self.lineno = 1 -+ -+ # a list of lines that represents a buffered "block" of code, -+ # which can be later printed relative to an indent level -+ self.line_buffer = [] -+ -+ self.in_indent_lines = False -+ -+ self._reset_multi_line_flags() -+ -+ # mapping of generated python lines to template -+ # source lines -+ self.source_map = {} -+ -+ def _update_lineno(self, num): -+ self.lineno += num -+ -+ def start_source(self, lineno): -+ if self.lineno not in self.source_map: -+ self.source_map[self.lineno] = lineno -+ -+ def write_blanks(self, num): -+ self.stream.write("\n" * num) -+ self._update_lineno(num) -+ -+ def write_indented_block(self, block): -+ """print a line or lines of python which already contain indentation. -+ -+ The indentation of the total block of lines will be adjusted to that of -+ the current indent level.""" -+ self.in_indent_lines = False -+ for l in re.split(r'\r?\n', block): -+ self.line_buffer.append(l) -+ self._update_lineno(1) -+ -+ def writelines(self, *lines): -+ """print a series of lines of python.""" -+ for line in lines: -+ self.writeline(line) -+ -+ def writeline(self, line): -+ """print a line of python, indenting it according to the current -+ indent level. -+ -+ this also adjusts the indentation counter according to the -+ content of the line. 
-+ -+ """ -+ -+ if not self.in_indent_lines: -+ self._flush_adjusted_lines() -+ self.in_indent_lines = True -+ -+ if (line is None or -+ re.match(r"^\s*#",line) or -+ re.match(r"^\s*$", line) -+ ): -+ hastext = False -+ else: -+ hastext = True -+ -+ is_comment = line and len(line) and line[0] == '#' -+ -+ # see if this line should decrease the indentation level -+ if (not is_comment and -+ (not hastext or self._is_unindentor(line)) -+ ): -+ -+ if self.indent > 0: -+ self.indent -= 1 -+ # if the indent_detail stack is empty, the user -+ # probably put extra closures - the resulting -+ # module wont compile. -+ if len(self.indent_detail) == 0: -+ raise exceptions.SyntaxException( -+ "Too many whitespace closures") -+ self.indent_detail.pop() -+ -+ if line is None: -+ return -+ -+ # write the line -+ self.stream.write(self._indent_line(line) + "\n") -+ self._update_lineno(len(line.split("\n"))) -+ -+ # see if this line should increase the indentation level. -+ # note that a line can both decrase (before printing) and -+ # then increase (after printing) the indentation level. -+ -+ if re.search(r":[ \t]*(?:#.*)?$", line): -+ # increment indentation count, and also -+ # keep track of what the keyword was that indented us, -+ # if it is a python compound statement keyword -+ # where we might have to look for an "unindent" keyword -+ match = re.match(r"^\s*(if|try|elif|while|for|with)", line) -+ if match: -+ # its a "compound" keyword, so we will check for "unindentors" -+ indentor = match.group(1) -+ self.indent += 1 -+ self.indent_detail.append(indentor) -+ else: -+ indentor = None -+ # its not a "compound" keyword. but lets also -+ # test for valid Python keywords that might be indenting us, -+ # else assume its a non-indenting line -+ m2 = re.match(r"^\s*(def|class|else|elif|except|finally)", -+ line) -+ if m2: -+ self.indent += 1 -+ self.indent_detail.append(indentor) -+ -+ def close(self): -+ """close this printer, flushing any remaining lines.""" -+ self._flush_adjusted_lines() -+ -+ def _is_unindentor(self, line): -+ """return true if the given line is an 'unindentor', -+ relative to the last 'indent' event received. -+ -+ """ -+ -+ # no indentation detail has been pushed on; return False -+ if len(self.indent_detail) == 0: -+ return False -+ -+ indentor = self.indent_detail[-1] -+ -+ # the last indent keyword we grabbed is not a -+ # compound statement keyword; return False -+ if indentor is None: -+ return False -+ -+ # if the current line doesnt have one of the "unindentor" keywords, -+ # return False -+ match = re.match(r"^\s*(else|elif|except|finally).*\:", line) -+ if not match: -+ return False -+ -+ # whitespace matches up, we have a compound indentor, -+ # and this line has an unindentor, this -+ # is probably good enough -+ return True -+ -+ # should we decide that its not good enough, heres -+ # more stuff to check. -+ #keyword = match.group(1) -+ -+ # match the original indent keyword -+ #for crit in [ -+ # (r'if|elif', r'else|elif'), -+ # (r'try', r'except|finally|else'), -+ # (r'while|for', r'else'), -+ #]: -+ # if re.match(crit[0], indentor) and re.match(crit[1], keyword): -+ # return True -+ -+ #return False -+ -+ def _indent_line(self, line, stripspace=''): -+ """indent the given line according to the current indent level. 
-+ -+ stripspace is a string of space that will be truncated from the -+ start of the line before indenting.""" -+ -+ return re.sub(r"^%s" % stripspace, self.indentstring -+ * self.indent, line) -+ -+ def _reset_multi_line_flags(self): -+ """reset the flags which would indicate we are in a backslashed -+ or triple-quoted section.""" -+ -+ self.backslashed, self.triplequoted = False, False -+ -+ def _in_multi_line(self, line): -+ """return true if the given line is part of a multi-line block, -+ via backslash or triple-quote.""" -+ -+ # we are only looking for explicitly joined lines here, not -+ # implicit ones (i.e. brackets, braces etc.). this is just to -+ # guard against the possibility of modifying the space inside of -+ # a literal multiline string with unfortunately placed -+ # whitespace -+ -+ current_state = (self.backslashed or self.triplequoted) -+ -+ if re.search(r"\\$", line): -+ self.backslashed = True -+ else: -+ self.backslashed = False -+ -+ triples = len(re.findall(r"\"\"\"|\'\'\'", line)) -+ if triples == 1 or triples % 2 != 0: -+ self.triplequoted = not self.triplequoted -+ -+ return current_state -+ -+ def _flush_adjusted_lines(self): -+ stripspace = None -+ self._reset_multi_line_flags() -+ -+ for entry in self.line_buffer: -+ if self._in_multi_line(entry): -+ self.stream.write(entry + "\n") -+ else: -+ entry = entry.expandtabs() -+ if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry): -+ stripspace = re.match(r"^([ \t]*)", entry).group(1) -+ self.stream.write(self._indent_line(entry, stripspace) + "\n") -+ -+ self.line_buffer = [] -+ self._reset_multi_line_flags() -+ -+ -+def adjust_whitespace(text): -+ """remove the left-whitespace margin of a block of Python code.""" -+ -+ state = [False, False] -+ (backslashed, triplequoted) = (0, 1) -+ -+ def in_multi_line(line): -+ start_state = (state[backslashed] or state[triplequoted]) -+ -+ if re.search(r"\\$", line): -+ state[backslashed] = True -+ else: -+ state[backslashed] = False -+ -+ def match(reg, t): -+ m = re.match(reg, t) -+ if m: -+ return m, t[len(m.group(0)):] -+ else: -+ return None, t -+ -+ while line: -+ if state[triplequoted]: -+ m, line = match(r"%s" % state[triplequoted], line) -+ if m: -+ state[triplequoted] = False -+ else: -+ m, line = match(r".*?(?=%s|$)" % state[triplequoted], line) -+ else: -+ m, line = match(r'#', line) -+ if m: -+ return start_state -+ -+ m, line = match(r"\"\"\"|\'\'\'", line) -+ if m: -+ state[triplequoted] = m.group(0) -+ continue -+ -+ m, line = match(r".*?(?=\"\"\"|\'\'\'|#|$)", line) -+ -+ return start_state -+ -+ def _indent_line(line, stripspace=''): -+ return re.sub(r"^%s" % stripspace, '', line) -+ -+ lines = [] -+ stripspace = None -+ -+ for line in re.split(r'\r?\n', text): -+ if in_multi_line(line): -+ lines.append(line) -+ else: -+ line = line.expandtabs() -+ if stripspace is None and re.search(r"^[ \t]*[^# \t]", line): -+ stripspace = re.match(r"^([ \t]*)", line).group(1) -+ lines.append(_indent_line(line, stripspace)) -+ return "\n".join(lines) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py -new file mode 100644 -index 0000000..bfa46a9 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py -@@ -0,0 +1,232 @@ -+# mako/pyparser.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""Handles parsing 
of Python code. -+ -+Parsing to AST is done via _ast on Python > 2.5, otherwise the compiler -+module is used. -+""" -+ -+from mako import exceptions, util, compat -+from mako.compat import arg_stringname -+import operator -+ -+if compat.py3k: -+ # words that cannot be assigned to (notably -+ # smaller than the total keys in __builtins__) -+ reserved = set(['True', 'False', 'None', 'print']) -+ -+ # the "id" attribute on a function node -+ arg_id = operator.attrgetter('arg') -+else: -+ # words that cannot be assigned to (notably -+ # smaller than the total keys in __builtins__) -+ reserved = set(['True', 'False', 'None']) -+ -+ # the "id" attribute on a function node -+ arg_id = operator.attrgetter('id') -+ -+import _ast -+util.restore__ast(_ast) -+from mako import _ast_util -+ -+ -+def parse(code, mode='exec', **exception_kwargs): -+ """Parse an expression into AST""" -+ -+ try: -+ return _ast_util.parse(code, '', mode) -+ except Exception: -+ raise exceptions.SyntaxException( -+ "(%s) %s (%r)" % ( -+ compat.exception_as().__class__.__name__, -+ compat.exception_as(), -+ code[0:50] -+ ), **exception_kwargs) -+ -+ -+class FindIdentifiers(_ast_util.NodeVisitor): -+ -+ def __init__(self, listener, **exception_kwargs): -+ self.in_function = False -+ self.in_assign_targets = False -+ self.local_ident_stack = set() -+ self.listener = listener -+ self.exception_kwargs = exception_kwargs -+ -+ def _add_declared(self, name): -+ if not self.in_function: -+ self.listener.declared_identifiers.add(name) -+ else: -+ self.local_ident_stack.add(name) -+ -+ def visit_ClassDef(self, node): -+ self._add_declared(node.name) -+ -+ def visit_Assign(self, node): -+ -+ # flip around the visiting of Assign so the expression gets -+ # evaluated first, in the case of a clause like "x=x+5" (x -+ # is undeclared) -+ -+ self.visit(node.value) -+ in_a = self.in_assign_targets -+ self.in_assign_targets = True -+ for n in node.targets: -+ self.visit(n) -+ self.in_assign_targets = in_a -+ -+ if compat.py3k: -+ -+ # ExceptHandler is in Python 2, but this block only works in -+ # Python 3 (and is required there) -+ -+ def visit_ExceptHandler(self, node): -+ if node.name is not None: -+ self._add_declared(node.name) -+ if node.type is not None: -+ self.visit(node.type) -+ for statement in node.body: -+ self.visit(statement) -+ -+ def visit_Lambda(self, node, *args): -+ self._visit_function(node, True) -+ -+ def visit_FunctionDef(self, node): -+ self._add_declared(node.name) -+ self._visit_function(node, False) -+ -+ def _expand_tuples(self, args): -+ for arg in args: -+ if isinstance(arg, _ast.Tuple): -+ for n in arg.elts: -+ yield n -+ else: -+ yield arg -+ -+ def _visit_function(self, node, islambda): -+ -+ # push function state onto stack. dont log any more -+ # identifiers as "declared" until outside of the function, -+ # but keep logging identifiers as "undeclared". 
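The identifier tracking implemented by ``FindIdentifiers`` is normally driven through ``mako.ast.PythonCode``, which the parse tree nodes above rely on; a minimal sketch (the keyword arguments simply mirror ``Node.exception_kwargs``):

    from mako import ast

    kw = dict(source='', lineno=1, pos=1, filename=None)
    code = ast.PythonCode("y = x + offset", **kw)
    print(sorted(code.declared_identifiers))      # ['y']
    print(sorted(code.undeclared_identifiers))    # ['offset', 'x']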
track -+ # argument names in each function header so they arent -+ # counted as "undeclared" -+ -+ inf = self.in_function -+ self.in_function = True -+ -+ local_ident_stack = self.local_ident_stack -+ self.local_ident_stack = local_ident_stack.union([ -+ arg_id(arg) for arg in self._expand_tuples(node.args.args) -+ ]) -+ if islambda: -+ self.visit(node.body) -+ else: -+ for n in node.body: -+ self.visit(n) -+ self.in_function = inf -+ self.local_ident_stack = local_ident_stack -+ -+ def visit_For(self, node): -+ -+ # flip around visit -+ -+ self.visit(node.iter) -+ self.visit(node.target) -+ for statement in node.body: -+ self.visit(statement) -+ for statement in node.orelse: -+ self.visit(statement) -+ -+ def visit_Name(self, node): -+ if isinstance(node.ctx, _ast.Store): -+ # this is eqiuvalent to visit_AssName in -+ # compiler -+ self._add_declared(node.id) -+ elif node.id not in reserved and node.id \ -+ not in self.listener.declared_identifiers and node.id \ -+ not in self.local_ident_stack: -+ self.listener.undeclared_identifiers.add(node.id) -+ -+ def visit_Import(self, node): -+ for name in node.names: -+ if name.asname is not None: -+ self._add_declared(name.asname) -+ else: -+ self._add_declared(name.name.split('.')[0]) -+ -+ def visit_ImportFrom(self, node): -+ for name in node.names: -+ if name.asname is not None: -+ self._add_declared(name.asname) -+ else: -+ if name.name == '*': -+ raise exceptions.CompileException( -+ "'import *' is not supported, since all identifier " -+ "names must be explicitly declared. Please use the " -+ "form 'from import , , " -+ "...' instead.", **self.exception_kwargs) -+ self._add_declared(name.name) -+ -+ -+class FindTuple(_ast_util.NodeVisitor): -+ -+ def __init__(self, listener, code_factory, **exception_kwargs): -+ self.listener = listener -+ self.exception_kwargs = exception_kwargs -+ self.code_factory = code_factory -+ -+ def visit_Tuple(self, node): -+ for n in node.elts: -+ p = self.code_factory(n, **self.exception_kwargs) -+ self.listener.codeargs.append(p) -+ self.listener.args.append(ExpressionGenerator(n).value()) -+ self.listener.declared_identifiers = \ -+ self.listener.declared_identifiers.union( -+ p.declared_identifiers) -+ self.listener.undeclared_identifiers = \ -+ self.listener.undeclared_identifiers.union( -+ p.undeclared_identifiers) -+ -+ -+class ParseFunc(_ast_util.NodeVisitor): -+ -+ def __init__(self, listener, **exception_kwargs): -+ self.listener = listener -+ self.exception_kwargs = exception_kwargs -+ -+ def visit_FunctionDef(self, node): -+ self.listener.funcname = node.name -+ -+ argnames = [arg_id(arg) for arg in node.args.args] -+ if node.args.vararg: -+ argnames.append(arg_stringname(node.args.vararg)) -+ -+ if compat.py2k: -+ # kw-only args don't exist in Python 2 -+ kwargnames = [] -+ else: -+ kwargnames = [arg_id(arg) for arg in node.args.kwonlyargs] -+ if node.args.kwarg: -+ kwargnames.append(arg_stringname(node.args.kwarg)) -+ self.listener.argnames = argnames -+ self.listener.defaults = node.args.defaults # ast -+ self.listener.kwargnames = kwargnames -+ if compat.py2k: -+ self.listener.kwdefaults = [] -+ else: -+ self.listener.kwdefaults = node.args.kw_defaults -+ self.listener.varargs = node.args.vararg -+ self.listener.kwargs = node.args.kwarg -+ -+class ExpressionGenerator(object): -+ -+ def __init__(self, astnode): -+ self.generator = _ast_util.SourceGenerator(' ' * 4) -+ self.generator.visit(astnode) -+ -+ def value(self): -+ return ''.join(self.generator.result) -diff --git 
a/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py -new file mode 100644 -index 0000000..6b6a35a ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py -@@ -0,0 +1,878 @@ -+# mako/runtime.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""provides runtime services for templates, including Context, -+Namespace, and various helper functions.""" -+ -+from mako import exceptions, util, compat -+from mako.compat import compat_builtins -+import sys -+ -+ -+class Context(object): -+ """Provides runtime namespace, output buffer, and various -+ callstacks for templates. -+ -+ See :ref:`runtime_toplevel` for detail on the usage of -+ :class:`.Context`. -+ -+ """ -+ -+ def __init__(self, buffer, **data): -+ self._buffer_stack = [buffer] -+ -+ self._data = data -+ -+ self._kwargs = data.copy() -+ self._with_template = None -+ self._outputting_as_unicode = None -+ self.namespaces = {} -+ -+ # "capture" function which proxies to the -+ # generic "capture" function -+ self._data['capture'] = compat.partial(capture, self) -+ -+ # "caller" stack used by def calls with content -+ self.caller_stack = self._data['caller'] = CallerStack() -+ -+ def _set_with_template(self, t): -+ self._with_template = t -+ illegal_names = t.reserved_names.intersection(self._data) -+ if illegal_names: -+ raise exceptions.NameConflictError( -+ "Reserved words passed to render(): %s" % -+ ", ".join(illegal_names)) -+ -+ @property -+ def lookup(self): -+ """Return the :class:`.TemplateLookup` associated -+ with this :class:`.Context`. -+ -+ """ -+ return self._with_template.lookup -+ -+ @property -+ def kwargs(self): -+ """Return the dictionary of top level keyword arguments associated -+ with this :class:`.Context`. -+ -+ This dictionary only includes the top-level arguments passed to -+ :meth:`.Template.render`. It does not include names produced within -+ the template execution such as local variable names or special names -+ such as ``self``, ``next``, etc. -+ -+ The purpose of this dictionary is primarily for the case that -+ a :class:`.Template` accepts arguments via its ``<%page>`` tag, -+ which are normally expected to be passed via :meth:`.Template.render`, -+ except the template is being called in an inheritance context, -+ using the ``body()`` method. 
:attr:`.Context.kwargs` can then be -+ used to propagate these arguments to the inheriting template:: -+ -+ ${next.body(**context.kwargs)} -+ -+ """ -+ return self._kwargs.copy() -+ -+ def push_caller(self, caller): -+ """Push a ``caller`` callable onto the callstack for -+ this :class:`.Context`.""" -+ -+ -+ self.caller_stack.append(caller) -+ -+ def pop_caller(self): -+ """Pop a ``caller`` callable onto the callstack for this -+ :class:`.Context`.""" -+ -+ del self.caller_stack[-1] -+ -+ def keys(self): -+ """Return a list of all names established in this :class:`.Context`.""" -+ -+ return list(self._data.keys()) -+ -+ def __getitem__(self, key): -+ if key in self._data: -+ return self._data[key] -+ else: -+ return compat_builtins.__dict__[key] -+ -+ def _push_writer(self): -+ """push a capturing buffer onto this Context and return -+ the new writer function.""" -+ -+ buf = util.FastEncodingBuffer() -+ self._buffer_stack.append(buf) -+ return buf.write -+ -+ def _pop_buffer_and_writer(self): -+ """pop the most recent capturing buffer from this Context -+ and return the current writer after the pop. -+ -+ """ -+ -+ buf = self._buffer_stack.pop() -+ return buf, self._buffer_stack[-1].write -+ -+ def _push_buffer(self): -+ """push a capturing buffer onto this Context.""" -+ -+ self._push_writer() -+ -+ def _pop_buffer(self): -+ """pop the most recent capturing buffer from this Context.""" -+ -+ return self._buffer_stack.pop() -+ -+ def get(self, key, default=None): -+ """Return a value from this :class:`.Context`.""" -+ -+ return self._data.get(key, compat_builtins.__dict__.get(key, default)) -+ -+ def write(self, string): -+ """Write a string to this :class:`.Context` object's -+ underlying output buffer.""" -+ -+ self._buffer_stack[-1].write(string) -+ -+ def writer(self): -+ """Return the current writer function.""" -+ -+ return self._buffer_stack[-1].write -+ -+ def _copy(self): -+ c = Context.__new__(Context) -+ c._buffer_stack = self._buffer_stack -+ c._data = self._data.copy() -+ c._kwargs = self._kwargs -+ c._with_template = self._with_template -+ c._outputting_as_unicode = self._outputting_as_unicode -+ c.namespaces = self.namespaces -+ c.caller_stack = self.caller_stack -+ return c -+ -+ def _locals(self, d): -+ """Create a new :class:`.Context` with a copy of this -+ :class:`.Context`'s current state, -+ updated with the given dictionary. -+ -+ The :attr:`.Context.kwargs` collection remains -+ unaffected. -+ -+ -+ """ -+ -+ if not d: -+ return self -+ c = self._copy() -+ c._data.update(d) -+ return c -+ -+ def _clean_inheritance_tokens(self): -+ """create a new copy of this :class:`.Context`. with -+ tokens related to inheritance state removed.""" -+ -+ c = self._copy() -+ x = c._data -+ x.pop('self', None) -+ x.pop('parent', None) -+ x.pop('next', None) -+ return c -+ -+class CallerStack(list): -+ def __init__(self): -+ self.nextcaller = None -+ -+ def __nonzero__(self): -+ return self.__bool__() -+ -+ def __bool__(self): -+ return len(self) and self._get_caller() and True or False -+ -+ def _get_caller(self): -+ # this method can be removed once -+ # codegen MAGIC_NUMBER moves past 7 -+ return self[-1] -+ -+ def __getattr__(self, key): -+ return getattr(self._get_caller(), key) -+ -+ def _push_frame(self): -+ frame = self.nextcaller or None -+ self.append(frame) -+ self.nextcaller = None -+ return frame -+ -+ def _pop_frame(self): -+ self.nextcaller = self.pop() -+ -+ -+class Undefined(object): -+ """Represents an undefined value in a template. 
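The ``get()``/``__getitem__`` fallback above means template expressions see Python builtins automatically, while truly missing names fall back to ``UNDEFINED``; a small sketch:

    from mako.template import Template

    # builtins are resolved through the Context's fallback to __builtins__
    print(Template("${len('OpenSWR')}").render())     # 7

    # a name that is neither supplied nor a builtin becomes UNDEFINED,
    # and rendering it raises NameError("Undefined")
    try:
        Template("${missing}").render()
    except NameError as err:
        print(err)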
-+ -+ All template modules have a constant value -+ ``UNDEFINED`` present which is an instance of this -+ object. -+ -+ """ -+ def __str__(self): -+ raise NameError("Undefined") -+ -+ def __nonzero__(self): -+ return self.__bool__() -+ -+ def __bool__(self): -+ return False -+ -+UNDEFINED = Undefined() -+ -+class LoopStack(object): -+ """a stack for LoopContexts that implements the context manager protocol -+ to automatically pop off the top of the stack on context exit -+ """ -+ -+ def __init__(self): -+ self.stack = [] -+ -+ def _enter(self, iterable): -+ self._push(iterable) -+ return self._top -+ -+ def _exit(self): -+ self._pop() -+ return self._top -+ -+ @property -+ def _top(self): -+ if self.stack: -+ return self.stack[-1] -+ else: -+ return self -+ -+ def _pop(self): -+ return self.stack.pop() -+ -+ def _push(self, iterable): -+ new = LoopContext(iterable) -+ if self.stack: -+ new.parent = self.stack[-1] -+ return self.stack.append(new) -+ -+ def __getattr__(self, key): -+ raise exceptions.RuntimeException("No loop context is established") -+ -+ def __iter__(self): -+ return iter(self._top) -+ -+ -+class LoopContext(object): -+ """A magic loop variable. -+ Automatically accessible in any ``% for`` block. -+ -+ See the section :ref:`loop_context` for usage -+ notes. -+ -+ :attr:`parent` -> :class:`.LoopContext` or ``None`` -+ The parent loop, if one exists. -+ :attr:`index` -> `int` -+ The 0-based iteration count. -+ :attr:`reverse_index` -> `int` -+ The number of iterations remaining. -+ :attr:`first` -> `bool` -+ ``True`` on the first iteration, ``False`` otherwise. -+ :attr:`last` -> `bool` -+ ``True`` on the last iteration, ``False`` otherwise. -+ :attr:`even` -> `bool` -+ ``True`` when ``index`` is even. -+ :attr:`odd` -> `bool` -+ ``True`` when ``index`` is odd. -+ """ -+ -+ def __init__(self, iterable): -+ self._iterable = iterable -+ self.index = 0 -+ self.parent = None -+ -+ def __iter__(self): -+ for i in self._iterable: -+ yield i -+ self.index += 1 -+ -+ @util.memoized_instancemethod -+ def __len__(self): -+ return len(self._iterable) -+ -+ @property -+ def reverse_index(self): -+ return len(self) - self.index - 1 -+ -+ @property -+ def first(self): -+ return self.index == 0 -+ -+ @property -+ def last(self): -+ return self.index == len(self) - 1 -+ -+ @property -+ def even(self): -+ return not self.odd -+ -+ @property -+ def odd(self): -+ return bool(self.index % 2) -+ -+ def cycle(self, *values): -+ """Cycle through values as the loop progresses. -+ """ -+ if not values: -+ raise ValueError("You must provide values to cycle through") -+ return values[self.index % len(values)] -+ -+ -+class _NSAttr(object): -+ def __init__(self, parent): -+ self.__parent = parent -+ def __getattr__(self, key): -+ ns = self.__parent -+ while ns: -+ if hasattr(ns.module, key): -+ return getattr(ns.module, key) -+ else: -+ ns = ns.inherits -+ raise AttributeError(key) -+ -+class Namespace(object): -+ """Provides access to collections of rendering methods, which -+ can be local, from other templates, or from imported modules. -+ -+ To access a particular rendering method referenced by a -+ :class:`.Namespace`, use plain attribute access: -+ -+ .. sourcecode:: mako -+ -+ ${some_namespace.foo(x, y, z)} -+ -+ :class:`.Namespace` also contains several built-in attributes -+ described here. 
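The ``loop`` variable exposed by :class:`.LoopContext` is available in any ``% for`` block when ``enable_loop`` is on (the default); for example:

    from mako.template import Template

    t = Template(
        "% for item in ('a', 'b', 'c'):\n"
        "${loop.index}/${loop.reverse_index} ${item}"
        "${' <- last' if loop.last else ''}\n"
        "% endfor\n")
    print(t.render())
    # 0/2 a
    # 1/1 b
    # 2/0 c <- last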
-+ -+ """ -+ -+ def __init__(self, name, context, -+ callables=None, inherits=None, -+ populate_self=True, calling_uri=None): -+ self.name = name -+ self.context = context -+ self.inherits = inherits -+ if callables is not None: -+ self.callables = dict([(c.__name__, c) for c in callables]) -+ -+ callables = () -+ -+ module = None -+ """The Python module referenced by this :class:`.Namespace`. -+ -+ If the namespace references a :class:`.Template`, then -+ this module is the equivalent of ``template.module``, -+ i.e. the generated module for the template. -+ -+ """ -+ -+ template = None -+ """The :class:`.Template` object referenced by this -+ :class:`.Namespace`, if any. -+ -+ """ -+ -+ context = None -+ """The :class:`.Context` object for this :class:`.Namespace`. -+ -+ Namespaces are often created with copies of contexts that -+ contain slightly different data, particularly in inheritance -+ scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one -+ can traverse an entire chain of templates that inherit from -+ one-another. -+ -+ """ -+ -+ filename = None -+ """The path of the filesystem file used for this -+ :class:`.Namespace`'s module or template. -+ -+ If this is a pure module-based -+ :class:`.Namespace`, this evaluates to ``module.__file__``. If a -+ template-based namespace, it evaluates to the original -+ template file location. -+ -+ """ -+ -+ uri = None -+ """The URI for this :class:`.Namespace`'s template. -+ -+ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. -+ -+ This is the equivalent of :attr:`.Template.uri`. -+ -+ """ -+ -+ _templateuri = None -+ -+ @util.memoized_property -+ def attr(self): -+ """Access module level attributes by name. -+ -+ This accessor allows templates to supply "scalar" -+ attributes which are particularly handy in inheritance -+ relationships. -+ -+ .. seealso:: -+ -+ :ref:`inheritance_attr` -+ -+ :ref:`namespace_attr_for_includes` -+ -+ """ -+ return _NSAttr(self) -+ -+ def get_namespace(self, uri): -+ """Return a :class:`.Namespace` corresponding to the given ``uri``. -+ -+ If the given ``uri`` is a relative URI (i.e. it does not -+ contain a leading slash ``/``), the ``uri`` is adjusted to -+ be relative to the ``uri`` of the namespace itself. This -+ method is therefore mostly useful off of the built-in -+ ``local`` namespace, described in :ref:`namespace_local`. -+ -+ In -+ most cases, a template wouldn't need this function, and -+ should instead use the ``<%namespace>`` tag to load -+ namespaces. However, since all ``<%namespace>`` tags are -+ evaluated before the body of a template ever runs, -+ this method can be used to locate namespaces using -+ expressions that were generated within the body code of -+ the template, or to conditionally use a particular -+ namespace. -+ -+ """ -+ key = (self, uri) -+ if key in self.context.namespaces: -+ return self.context.namespaces[key] -+ else: -+ ns = TemplateNamespace(uri, self.context._copy(), -+ templateuri=uri, -+ calling_uri=self._templateuri) -+ self.context.namespaces[key] = ns -+ return ns -+ -+ def get_template(self, uri): -+ """Return a :class:`.Template` from the given ``uri``. -+ -+ The ``uri`` resolution is relative to the ``uri`` of this -+ :class:`.Namespace` object's :class:`.Template`. -+ -+ """ -+ return _lookup_template(self.context, uri, self._templateuri) -+ -+ def get_cached(self, key, **kwargs): -+ """Return a value from the :class:`.Cache` referenced by this -+ :class:`.Namespace` object's :class:`.Template`. 
-+ -+ The advantage to this method versus direct access to the -+ :class:`.Cache` is that the configuration parameters -+ declared in ``<%page>`` take effect here, thereby calling -+ up the same configured backend as that configured -+ by ``<%page>``. -+ -+ """ -+ -+ return self.cache.get(key, **kwargs) -+ -+ @property -+ def cache(self): -+ """Return the :class:`.Cache` object referenced -+ by this :class:`.Namespace` object's -+ :class:`.Template`. -+ -+ """ -+ return self.template.cache -+ -+ def include_file(self, uri, **kwargs): -+ """Include a file at the given ``uri``.""" -+ -+ _include_file(self.context, uri, self._templateuri, **kwargs) -+ -+ def _populate(self, d, l): -+ for ident in l: -+ if ident == '*': -+ for (k, v) in self._get_star(): -+ d[k] = v -+ else: -+ d[ident] = getattr(self, ident) -+ -+ def _get_star(self): -+ if self.callables: -+ for key in self.callables: -+ yield (key, self.callables[key]) -+ -+ def __getattr__(self, key): -+ if key in self.callables: -+ val = self.callables[key] -+ elif self.inherits: -+ val = getattr(self.inherits, key) -+ else: -+ raise AttributeError( -+ "Namespace '%s' has no member '%s'" % -+ (self.name, key)) -+ setattr(self, key, val) -+ return val -+ -+class TemplateNamespace(Namespace): -+ """A :class:`.Namespace` specific to a :class:`.Template` instance.""" -+ -+ def __init__(self, name, context, template=None, templateuri=None, -+ callables=None, inherits=None, -+ populate_self=True, calling_uri=None): -+ self.name = name -+ self.context = context -+ self.inherits = inherits -+ if callables is not None: -+ self.callables = dict([(c.__name__, c) for c in callables]) -+ -+ if templateuri is not None: -+ self.template = _lookup_template(context, templateuri, -+ calling_uri) -+ self._templateuri = self.template.module._template_uri -+ elif template is not None: -+ self.template = template -+ self._templateuri = template.module._template_uri -+ else: -+ raise TypeError("'template' argument is required.") -+ -+ if populate_self: -+ lclcallable, lclcontext = \ -+ _populate_self_namespace(context, self.template, -+ self_ns=self) -+ -+ @property -+ def module(self): -+ """The Python module referenced by this :class:`.Namespace`. -+ -+ If the namespace references a :class:`.Template`, then -+ this module is the equivalent of ``template.module``, -+ i.e. the generated module for the template. -+ -+ """ -+ return self.template.module -+ -+ @property -+ def filename(self): -+ """The path of the filesystem file used for this -+ :class:`.Namespace`'s module or template. -+ """ -+ return self.template.filename -+ -+ @property -+ def uri(self): -+ """The URI for this :class:`.Namespace`'s template. -+ -+ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. -+ -+ This is the equivalent of :attr:`.Template.uri`. 
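Template inheritance builds on :class:`.TemplateNamespace` as well; a minimal sketch in which a page template wraps itself in a base template via ``<%inherit>``:

    from mako.lookup import TemplateLookup

    lookup = TemplateLookup(directories=['.'])
    lookup.put_string('base.html', 'header\n${self.body()}\nfooter')
    lookup.put_string('page.html',
        '<%inherit file="base.html"/>page content')
    print(lookup.get_template('page.html').render())
    # header
    # page content
    # footer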
-+ -+ """ -+ return self.template.uri -+ -+ def _get_star(self): -+ if self.callables: -+ for key in self.callables: -+ yield (key, self.callables[key]) -+ def get(key): -+ callable_ = self.template._get_def_callable(key) -+ return compat.partial(callable_, self.context) -+ for k in self.template.module._exports: -+ yield (k, get(k)) -+ -+ def __getattr__(self, key): -+ if key in self.callables: -+ val = self.callables[key] -+ elif self.template.has_def(key): -+ callable_ = self.template._get_def_callable(key) -+ val = compat.partial(callable_, self.context) -+ elif self.inherits: -+ val = getattr(self.inherits, key) -+ -+ else: -+ raise AttributeError( -+ "Namespace '%s' has no member '%s'" % -+ (self.name, key)) -+ setattr(self, key, val) -+ return val -+ -+class ModuleNamespace(Namespace): -+ """A :class:`.Namespace` specific to a Python module instance.""" -+ -+ def __init__(self, name, context, module, -+ callables=None, inherits=None, -+ populate_self=True, calling_uri=None): -+ self.name = name -+ self.context = context -+ self.inherits = inherits -+ if callables is not None: -+ self.callables = dict([(c.__name__, c) for c in callables]) -+ -+ mod = __import__(module) -+ for token in module.split('.')[1:]: -+ mod = getattr(mod, token) -+ self.module = mod -+ -+ @property -+ def filename(self): -+ """The path of the filesystem file used for this -+ :class:`.Namespace`'s module or template. -+ """ -+ return self.module.__file__ -+ -+ def _get_star(self): -+ if self.callables: -+ for key in self.callables: -+ yield (key, self.callables[key]) -+ for key in dir(self.module): -+ if key[0] != '_': -+ callable_ = getattr(self.module, key) -+ if compat.callable(callable_): -+ yield key, compat.partial(callable_, self.context) -+ -+ -+ def __getattr__(self, key): -+ if key in self.callables: -+ val = self.callables[key] -+ elif hasattr(self.module, key): -+ callable_ = getattr(self.module, key) -+ val = compat.partial(callable_, self.context) -+ elif self.inherits: -+ val = getattr(self.inherits, key) -+ else: -+ raise AttributeError( -+ "Namespace '%s' has no member '%s'" % -+ (self.name, key)) -+ setattr(self, key, val) -+ return val -+ -+def supports_caller(func): -+ """Apply a caller_stack compatibility decorator to a plain -+ Python function. -+ -+ See the example in :ref:`namespaces_python_modules`. -+ -+ """ -+ -+ def wrap_stackframe(context, *args, **kwargs): -+ context.caller_stack._push_frame() -+ try: -+ return func(context, *args, **kwargs) -+ finally: -+ context.caller_stack._pop_frame() -+ return wrap_stackframe -+ -+def capture(context, callable_, *args, **kwargs): -+ """Execute the given template def, capturing the output into -+ a buffer. -+ -+ See the example in :ref:`namespaces_python_modules`. -+ -+ """ -+ -+ if not compat.callable(callable_): -+ raise exceptions.RuntimeException( -+ "capture() function expects a callable as " -+ "its argument (i.e. 
capture(func, *args, **kwargs))" -+ ) -+ context._push_buffer() -+ try: -+ callable_(*args, **kwargs) -+ finally: -+ buf = context._pop_buffer() -+ return buf.getvalue() -+ -+def _decorate_toplevel(fn): -+ def decorate_render(render_fn): -+ def go(context, *args, **kw): -+ def y(*args, **kw): -+ return render_fn(context, *args, **kw) -+ try: -+ y.__name__ = render_fn.__name__[7:] -+ except TypeError: -+ # < Python 2.4 -+ pass -+ return fn(y)(context, *args, **kw) -+ return go -+ return decorate_render -+ -+def _decorate_inline(context, fn): -+ def decorate_render(render_fn): -+ dec = fn(render_fn) -+ def go(*args, **kw): -+ return dec(context, *args, **kw) -+ return go -+ return decorate_render -+ -+def _include_file(context, uri, calling_uri, **kwargs): -+ """locate the template from the given uri and include it in -+ the current output.""" -+ -+ template = _lookup_template(context, uri, calling_uri) -+ (callable_, ctx) = _populate_self_namespace( -+ context._clean_inheritance_tokens(), -+ template) -+ callable_(ctx, **_kwargs_for_include(callable_, context._data, **kwargs)) -+ -+def _inherit_from(context, uri, calling_uri): -+ """called by the _inherit method in template modules to set -+ up the inheritance chain at the start of a template's -+ execution.""" -+ -+ if uri is None: -+ return None -+ template = _lookup_template(context, uri, calling_uri) -+ self_ns = context['self'] -+ ih = self_ns -+ while ih.inherits is not None: -+ ih = ih.inherits -+ lclcontext = context._locals({'next': ih}) -+ ih.inherits = TemplateNamespace("self:%s" % template.uri, -+ lclcontext, -+ template=template, -+ populate_self=False) -+ context._data['parent'] = lclcontext._data['local'] = ih.inherits -+ callable_ = getattr(template.module, '_mako_inherit', None) -+ if callable_ is not None: -+ ret = callable_(template, lclcontext) -+ if ret: -+ return ret -+ -+ gen_ns = getattr(template.module, '_mako_generate_namespaces', None) -+ if gen_ns is not None: -+ gen_ns(context) -+ return (template.callable_, lclcontext) -+ -+def _lookup_template(context, uri, relativeto): -+ lookup = context._with_template.lookup -+ if lookup is None: -+ raise exceptions.TemplateLookupException( -+ "Template '%s' has no TemplateLookup associated" % -+ context._with_template.uri) -+ uri = lookup.adjust_uri(uri, relativeto) -+ try: -+ return lookup.get_template(uri) -+ except exceptions.TopLevelLookupException: -+ raise exceptions.TemplateLookupException(str(compat.exception_as())) -+ -+def _populate_self_namespace(context, template, self_ns=None): -+ if self_ns is None: -+ self_ns = TemplateNamespace('self:%s' % template.uri, -+ context, template=template, -+ populate_self=False) -+ context._data['self'] = context._data['local'] = self_ns -+ if hasattr(template.module, '_mako_inherit'): -+ ret = template.module._mako_inherit(template, context) -+ if ret: -+ return ret -+ return (template.callable_, context) -+ -+def _render(template, callable_, args, data, as_unicode=False): -+ """create a Context and return the string -+ output of the given template and template callable.""" -+ -+ if as_unicode: -+ buf = util.FastEncodingBuffer(as_unicode=True) -+ elif template.bytestring_passthrough: -+ buf = compat.StringIO() -+ else: -+ buf = util.FastEncodingBuffer( -+ as_unicode=as_unicode, -+ encoding=template.output_encoding, -+ errors=template.encoding_errors) -+ context = Context(buf, **data) -+ context._outputting_as_unicode = as_unicode -+ context._set_with_template(template) -+ -+ _render_context(template, callable_, context, *args, 
-+ **_kwargs_for_callable(callable_, data)) -+ return context._pop_buffer().getvalue() -+ -+def _kwargs_for_callable(callable_, data): -+ argspec = compat.inspect_func_args(callable_) -+ # for normal pages, **pageargs is usually present -+ if argspec[2]: -+ return data -+ -+ # for rendering defs from the top level, figure out the args -+ namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] -+ kwargs = {} -+ for arg in namedargs: -+ if arg != 'context' and arg in data and arg not in kwargs: -+ kwargs[arg] = data[arg] -+ return kwargs -+ -+def _kwargs_for_include(callable_, data, **kwargs): -+ argspec = compat.inspect_func_args(callable_) -+ namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] -+ for arg in namedargs: -+ if arg != 'context' and arg in data and arg not in kwargs: -+ kwargs[arg] = data[arg] -+ return kwargs -+ -+def _render_context(tmpl, callable_, context, *args, **kwargs): -+ import mako.template as template -+ # create polymorphic 'self' namespace for this -+ # template with possibly updated context -+ if not isinstance(tmpl, template.DefTemplate): -+ # if main render method, call from the base of the inheritance stack -+ (inherit, lclcontext) = _populate_self_namespace(context, tmpl) -+ _exec_template(inherit, lclcontext, args=args, kwargs=kwargs) -+ else: -+ # otherwise, call the actual rendering method specified -+ (inherit, lclcontext) = _populate_self_namespace(context, tmpl.parent) -+ _exec_template(callable_, context, args=args, kwargs=kwargs) -+ -+def _exec_template(callable_, context, args=None, kwargs=None): -+ """execute a rendering callable given the callable, a -+ Context, and optional explicit arguments -+ -+ the contextual Template will be located if it exists, and -+ the error handling options specified on that Template will -+ be interpreted here. 
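The error handling in ``_exec_template()`` corresponds to the ``format_exceptions`` and ``error_handler`` options; a brief sketch of both the automatic path and the equivalent done by calling code:

    from mako.template import Template
    from mako import exceptions

    # format_exceptions=True turns a runtime error into a rendered HTML error page
    html = Template("${1 / 0}", format_exceptions=True).render()

    # roughly the same thing done by hand in calling code
    try:
        Template("${1 / 0}").render()
    except Exception:
        html = exceptions.html_error_template().render()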
-+ """ -+ template = context._with_template -+ if template is not None and \ -+ (template.format_exceptions or template.error_handler): -+ try: -+ callable_(context, *args, **kwargs) -+ except Exception: -+ _render_error(template, context, compat.exception_as()) -+ except: -+ e = sys.exc_info()[0] -+ _render_error(template, context, e) -+ else: -+ callable_(context, *args, **kwargs) -+ -+def _render_error(template, context, error): -+ if template.error_handler: -+ result = template.error_handler(context, error) -+ if not result: -+ compat.reraise(*sys.exc_info()) -+ else: -+ error_template = exceptions.html_error_template() -+ if context._outputting_as_unicode: -+ context._buffer_stack[:] = [ -+ util.FastEncodingBuffer(as_unicode=True)] -+ else: -+ context._buffer_stack[:] = [util.FastEncodingBuffer( -+ error_template.output_encoding, -+ error_template.encoding_errors)] -+ -+ context._set_with_template(error_template) -+ error_template.render_context(context, error=error) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py -new file mode 100644 -index 0000000..fb61062 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py -@@ -0,0 +1,705 @@ -+# mako/template.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""Provides the Template class, a facade for parsing, generating and executing -+template strings, as well as template runtime operations.""" -+ -+from mako.lexer import Lexer -+from mako import runtime, util, exceptions, codegen, cache, compat -+import os -+import re -+import shutil -+import stat -+import sys -+import tempfile -+import types -+import weakref -+ -+ -+class Template(object): -+ """Represents a compiled template. -+ -+ :class:`.Template` includes a reference to the original -+ template source (via the :attr:`.source` attribute) -+ as well as the source code of the -+ generated Python module (i.e. the :attr:`.code` attribute), -+ as well as a reference to an actual Python module. -+ -+ :class:`.Template` is constructed using either a literal string -+ representing the template text, or a filename representing a filesystem -+ path to a source file. -+ -+ :param text: textual template source. This argument is mutually -+ exclusive versus the ``filename`` parameter. -+ -+ :param filename: filename of the source template. This argument is -+ mutually exclusive versus the ``text`` parameter. -+ -+ :param buffer_filters: string list of filters to be applied -+ to the output of ``%def``\ s which are buffered, cached, or otherwise -+ filtered, after all filters -+ defined with the ``%def`` itself have been applied. Allows the -+ creation of default expression filters that let the output -+ of return-valued ``%def``\ s "opt out" of that filtering via -+ passing special attributes or objects. -+ -+ :param bytestring_passthrough: When ``True``, and ``output_encoding`` is -+ set to ``None``, and :meth:`.Template.render` is used to render, -+ the `StringIO` or `cStringIO` buffer will be used instead of the -+ default "fast" buffer. This allows raw bytestrings in the -+ output stream, such as in expressions, to pass straight -+ through to the buffer. This flag is forced -+ to ``True`` if ``disable_unicode`` is also configured. -+ -+ .. versionadded:: 0.4 -+ Added to provide the same behavior as that of the previous series. 
-+ -+ :param cache_args: Dictionary of cache configuration arguments that -+ will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`. -+ -+ :param cache_dir: -+ -+ .. deprecated:: 0.6 -+ Use the ``'dir'`` argument in the ``cache_args`` dictionary. -+ See :ref:`caching_toplevel`. -+ -+ :param cache_enabled: Boolean flag which enables caching of this -+ template. See :ref:`caching_toplevel`. -+ -+ :param cache_impl: String name of a :class:`.CacheImpl` caching -+ implementation to use. Defaults to ``'beaker'``. -+ -+ :param cache_type: -+ -+ .. deprecated:: 0.6 -+ Use the ``'type'`` argument in the ``cache_args`` dictionary. -+ See :ref:`caching_toplevel`. -+ -+ :param cache_url: -+ -+ .. deprecated:: 0.6 -+ Use the ``'url'`` argument in the ``cache_args`` dictionary. -+ See :ref:`caching_toplevel`. -+ -+ :param default_filters: List of string filter names that will -+ be applied to all expressions. See :ref:`filtering_default_filters`. -+ -+ :param disable_unicode: Disables all awareness of Python Unicode -+ objects. See :ref:`unicode_disabled`. -+ -+ :param enable_loop: When ``True``, enable the ``loop`` context variable. -+ This can be set to ``False`` to support templates that may -+ be making usage of the name "``loop``". Individual templates can -+ re-enable the "loop" context by placing the directive -+ ``enable_loop="True"`` inside the ``<%page>`` tag -- see -+ :ref:`migrating_loop`. -+ -+ :param encoding_errors: Error parameter passed to ``encode()`` when -+ string encoding is performed. See :ref:`usage_unicode`. -+ -+ :param error_handler: Python callable which is called whenever -+ compile or runtime exceptions occur. The callable is passed -+ the current context as well as the exception. If the -+ callable returns ``True``, the exception is considered to -+ be handled, else it is re-raised after the function -+ completes. Is used to provide custom error-rendering -+ functions. -+ -+ :param format_exceptions: if ``True``, exceptions which occur during -+ the render phase of this template will be caught and -+ formatted into an HTML error page, which then becomes the -+ rendered result of the :meth:`.render` call. Otherwise, -+ runtime exceptions are propagated outwards. -+ -+ :param imports: String list of Python statements, typically individual -+ "import" lines, which will be placed into the module level -+ preamble of all generated Python modules. See the example -+ in :ref:`filtering_default_filters`. -+ -+ :param future_imports: String list of names to import from `__future__`. -+ These will be concatenated into a comma-separated string and inserted -+ into the beginning of the template, e.g. ``futures_imports=['FOO', -+ 'BAR']`` results in ``from __future__ import FOO, BAR``. If you're -+ interested in using features like the new division operator, you must -+ use future_imports to convey that to the renderer, as otherwise the -+ import will not appear as the first executed statement in the generated -+ code and will therefore not have the desired effect. -+ -+ :param input_encoding: Encoding of the template's source code. Can -+ be used in lieu of the coding comment. See -+ :ref:`usage_unicode` as well as :ref:`unicode_toplevel` for -+ details on source encoding. -+ -+ :param lookup: a :class:`.TemplateLookup` instance that will be used -+ for all file lookups via the ``<%namespace>``, -+ ``<%include>``, and ``<%inherit>`` tags. See -+ :ref:`usage_templatelookup`. 
-+ -+ :param module_directory: Filesystem location where generated -+ Python module files will be placed. -+ -+ :param module_filename: Overrides the filename of the generated -+ Python module file. For advanced usage only. -+ -+ :param module_writer: A callable which overrides how the Python -+ module is written entirely. The callable is passed the -+ encoded source content of the module and the destination -+ path to be written to. The default behavior of module writing -+ uses a tempfile in conjunction with a file move in order -+ to make the operation atomic. So a user-defined module -+ writing function that mimics the default behavior would be: -+ -+ .. sourcecode:: python -+ -+ import tempfile -+ import os -+ import shutil -+ -+ def module_writer(source, outputpath): -+ (dest, name) = \\ -+ tempfile.mkstemp( -+ dir=os.path.dirname(outputpath) -+ ) -+ -+ os.write(dest, source) -+ os.close(dest) -+ shutil.move(name, outputpath) -+ -+ from mako.template import Template -+ mytemplate = Template( -+ filename="index.html", -+ module_directory="/path/to/modules", -+ module_writer=module_writer -+ ) -+ -+ The function is provided for unusual configurations where -+ certain platform-specific permissions or other special -+ steps are needed. -+ -+ :param output_encoding: The encoding to use when :meth:`.render` -+ is called. -+ See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`. -+ -+ :param preprocessor: Python callable which will be passed -+ the full template source before it is parsed. The return -+ result of the callable will be used as the template source -+ code. -+ -+ :param lexer_cls: A :class:`.Lexer` class used to parse -+ the template. The :class:`.Lexer` class is used by -+ default. -+ -+ .. versionadded:: 0.7.4 -+ -+ :param strict_undefined: Replaces the automatic usage of -+ ``UNDEFINED`` for any undeclared variables not located in -+ the :class:`.Context` with an immediate raise of -+ ``NameError``. The advantage is immediate reporting of -+ missing variables which include the name. -+ -+ .. versionadded:: 0.3.6 -+ -+ :param uri: string URI or other identifier for this template. -+ If not provided, the ``uri`` is generated from the filesystem -+ path, or from the in-memory identity of a non-file-based -+ template. The primary usage of the ``uri`` is to provide a key -+ within :class:`.TemplateLookup`, as well as to generate the -+ file path of the generated Python module file, if -+ ``module_directory`` is specified. 
-+ -+ """ -+ -+ lexer_cls = Lexer -+ -+ def __init__(self, -+ text=None, -+ filename=None, -+ uri=None, -+ format_exceptions=False, -+ error_handler=None, -+ lookup=None, -+ output_encoding=None, -+ encoding_errors='strict', -+ module_directory=None, -+ cache_args=None, -+ cache_impl='beaker', -+ cache_enabled=True, -+ cache_type=None, -+ cache_dir=None, -+ cache_url=None, -+ module_filename=None, -+ input_encoding=None, -+ disable_unicode=False, -+ module_writer=None, -+ bytestring_passthrough=False, -+ default_filters=None, -+ buffer_filters=(), -+ strict_undefined=False, -+ imports=None, -+ future_imports=None, -+ enable_loop=True, -+ preprocessor=None, -+ lexer_cls=None): -+ if uri: -+ self.module_id = re.sub(r'\W', "_", uri) -+ self.uri = uri -+ elif filename: -+ self.module_id = re.sub(r'\W', "_", filename) -+ drive, path = os.path.splitdrive(filename) -+ path = os.path.normpath(path).replace(os.path.sep, "/") -+ self.uri = path -+ else: -+ self.module_id = "memory:" + hex(id(self)) -+ self.uri = self.module_id -+ -+ u_norm = self.uri -+ if u_norm.startswith("/"): -+ u_norm = u_norm[1:] -+ u_norm = os.path.normpath(u_norm) -+ if u_norm.startswith(".."): -+ raise exceptions.TemplateLookupException( -+ "Template uri \"%s\" is invalid - " -+ "it cannot be relative outside " -+ "of the root path." % self.uri) -+ -+ self.input_encoding = input_encoding -+ self.output_encoding = output_encoding -+ self.encoding_errors = encoding_errors -+ self.disable_unicode = disable_unicode -+ self.bytestring_passthrough = bytestring_passthrough or disable_unicode -+ self.enable_loop = enable_loop -+ self.strict_undefined = strict_undefined -+ self.module_writer = module_writer -+ -+ if compat.py3k and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "Mako for Python 3 does not " -+ "support disabling Unicode") -+ elif output_encoding and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "output_encoding must be set to " -+ "None when disable_unicode is used.") -+ if default_filters is None: -+ if compat.py3k or self.disable_unicode: -+ self.default_filters = ['str'] -+ else: -+ self.default_filters = ['unicode'] -+ else: -+ self.default_filters = default_filters -+ self.buffer_filters = buffer_filters -+ -+ self.imports = imports -+ self.future_imports = future_imports -+ self.preprocessor = preprocessor -+ -+ if lexer_cls is not None: -+ self.lexer_cls = lexer_cls -+ -+ # if plain text, compile code in memory only -+ if text is not None: -+ (code, module) = _compile_text(self, text, filename) -+ self._code = code -+ self._source = text -+ ModuleInfo(module, None, self, filename, code, text) -+ elif filename is not None: -+ # if template filename and a module directory, load -+ # a filesystem-based module file, generating if needed -+ if module_filename is not None: -+ path = module_filename -+ elif module_directory is not None: -+ path = os.path.abspath( -+ os.path.join( -+ os.path.normpath(module_directory), -+ u_norm + ".py" -+ ) -+ ) -+ else: -+ path = None -+ module = self._compile_from_file(path, filename) -+ else: -+ raise exceptions.RuntimeException( -+ "Template requires text or filename") -+ -+ self.module = module -+ self.filename = filename -+ self.callable_ = self.module.render_body -+ self.format_exceptions = format_exceptions -+ self.error_handler = error_handler -+ self.lookup = lookup -+ -+ self.module_directory = module_directory -+ -+ self._setup_cache_args( -+ cache_impl, cache_enabled, cache_args, -+ cache_type, cache_dir, cache_url -+ ) -+ -+ -+ 
@util.memoized_property -+ def reserved_names(self): -+ if self.enable_loop: -+ return codegen.RESERVED_NAMES -+ else: -+ return codegen.RESERVED_NAMES.difference(['loop']) -+ -+ def _setup_cache_args(self, -+ cache_impl, cache_enabled, cache_args, -+ cache_type, cache_dir, cache_url): -+ self.cache_impl = cache_impl -+ self.cache_enabled = cache_enabled -+ if cache_args: -+ self.cache_args = cache_args -+ else: -+ self.cache_args = {} -+ -+ # transfer deprecated cache_* args -+ if cache_type: -+ self.cache_args['type'] = cache_type -+ if cache_dir: -+ self.cache_args['dir'] = cache_dir -+ if cache_url: -+ self.cache_args['url'] = cache_url -+ -+ def _compile_from_file(self, path, filename): -+ if path is not None: -+ util.verify_directory(os.path.dirname(path)) -+ filemtime = os.stat(filename)[stat.ST_MTIME] -+ if not os.path.exists(path) or \ -+ os.stat(path)[stat.ST_MTIME] < filemtime: -+ data = util.read_file(filename) -+ _compile_module_file( -+ self, -+ data, -+ filename, -+ path, -+ self.module_writer) -+ module = compat.load_module(self.module_id, path) -+ del sys.modules[self.module_id] -+ if module._magic_number != codegen.MAGIC_NUMBER: -+ data = util.read_file(filename) -+ _compile_module_file( -+ self, -+ data, -+ filename, -+ path, -+ self.module_writer) -+ module = compat.load_module(self.module_id, path) -+ del sys.modules[self.module_id] -+ ModuleInfo(module, path, self, filename, None, None) -+ else: -+ # template filename and no module directory, compile code -+ # in memory -+ data = util.read_file(filename) -+ code, module = _compile_text( -+ self, -+ data, -+ filename) -+ self._source = None -+ self._code = code -+ ModuleInfo(module, None, self, filename, code, None) -+ return module -+ -+ @property -+ def source(self): -+ """Return the template source code for this :class:`.Template`.""" -+ -+ return _get_module_info_from_callable(self.callable_).source -+ -+ @property -+ def code(self): -+ """Return the module source code for this :class:`.Template`.""" -+ -+ return _get_module_info_from_callable(self.callable_).code -+ -+ @util.memoized_property -+ def cache(self): -+ return cache.Cache(self) -+ -+ @property -+ def cache_dir(self): -+ return self.cache_args['dir'] -+ @property -+ def cache_url(self): -+ return self.cache_args['url'] -+ @property -+ def cache_type(self): -+ return self.cache_args['type'] -+ -+ def render(self, *args, **data): -+ """Render the output of this template as a string. -+ -+ If the template specifies an output encoding, the string -+ will be encoded accordingly, else the output is raw (raw -+ output uses `cStringIO` and can't handle multibyte -+ characters). A :class:`.Context` object is created corresponding -+ to the given data. Arguments that are explicitly declared -+ by this template's internal rendering method are also -+ pulled from the given ``*args``, ``**data`` members. -+ -+ """ -+ return runtime._render(self, self.callable_, args, data) -+ -+ def render_unicode(self, *args, **data): -+ """Render the output of this template as a unicode object.""" -+ -+ return runtime._render(self, -+ self.callable_, -+ args, -+ data, -+ as_unicode=True) -+ -+ def render_context(self, context, *args, **kwargs): -+ """Render this :class:`.Template` with the given context. -+ -+ The data is written to the context's buffer. 
-+ -+ """ -+ if getattr(context, '_with_template', None) is None: -+ context._set_with_template(self) -+ runtime._render_context(self, -+ self.callable_, -+ context, -+ *args, -+ **kwargs) -+ -+ def has_def(self, name): -+ return hasattr(self.module, "render_%s" % name) -+ -+ def get_def(self, name): -+ """Return a def of this template as a :class:`.DefTemplate`.""" -+ -+ return DefTemplate(self, getattr(self.module, "render_%s" % name)) -+ -+ def _get_def_callable(self, name): -+ return getattr(self.module, "render_%s" % name) -+ -+ @property -+ def last_modified(self): -+ return self.module._modified_time -+ -+class ModuleTemplate(Template): -+ """A Template which is constructed given an existing Python module. -+ -+ e.g.:: -+ -+ t = Template("this is a template") -+ f = file("mymodule.py", "w") -+ f.write(t.code) -+ f.close() -+ -+ import mymodule -+ -+ t = ModuleTemplate(mymodule) -+ print t.render() -+ -+ """ -+ -+ def __init__(self, module, -+ module_filename=None, -+ template=None, -+ template_filename=None, -+ module_source=None, -+ template_source=None, -+ output_encoding=None, -+ encoding_errors='strict', -+ disable_unicode=False, -+ bytestring_passthrough=False, -+ format_exceptions=False, -+ error_handler=None, -+ lookup=None, -+ cache_args=None, -+ cache_impl='beaker', -+ cache_enabled=True, -+ cache_type=None, -+ cache_dir=None, -+ cache_url=None, -+ ): -+ self.module_id = re.sub(r'\W', "_", module._template_uri) -+ self.uri = module._template_uri -+ self.input_encoding = module._source_encoding -+ self.output_encoding = output_encoding -+ self.encoding_errors = encoding_errors -+ self.disable_unicode = disable_unicode -+ self.bytestring_passthrough = bytestring_passthrough or disable_unicode -+ self.enable_loop = module._enable_loop -+ -+ if compat.py3k and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "Mako for Python 3 does not " -+ "support disabling Unicode") -+ elif output_encoding and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "output_encoding must be set to " -+ "None when disable_unicode is used.") -+ -+ self.module = module -+ self.filename = template_filename -+ ModuleInfo(module, -+ module_filename, -+ self, -+ template_filename, -+ module_source, -+ template_source) -+ -+ self.callable_ = self.module.render_body -+ self.format_exceptions = format_exceptions -+ self.error_handler = error_handler -+ self.lookup = lookup -+ self._setup_cache_args( -+ cache_impl, cache_enabled, cache_args, -+ cache_type, cache_dir, cache_url -+ ) -+ -+class DefTemplate(Template): -+ """A :class:`.Template` which represents a callable def in a parent -+ template.""" -+ -+ def __init__(self, parent, callable_): -+ self.parent = parent -+ self.callable_ = callable_ -+ self.output_encoding = parent.output_encoding -+ self.module = parent.module -+ self.encoding_errors = parent.encoding_errors -+ self.format_exceptions = parent.format_exceptions -+ self.error_handler = parent.error_handler -+ self.enable_loop = parent.enable_loop -+ self.lookup = parent.lookup -+ self.bytestring_passthrough = parent.bytestring_passthrough -+ -+ def get_def(self, name): -+ return self.parent.get_def(name) -+ -+class ModuleInfo(object): -+ """Stores information about a module currently loaded into -+ memory, provides reverse lookups of template source, module -+ source code based on a module's identifier. 
-+ -+ """ -+ _modules = weakref.WeakValueDictionary() -+ -+ def __init__(self, -+ module, -+ module_filename, -+ template, -+ template_filename, -+ module_source, -+ template_source): -+ self.module = module -+ self.module_filename = module_filename -+ self.template_filename = template_filename -+ self.module_source = module_source -+ self.template_source = template_source -+ self._modules[module.__name__] = template._mmarker = self -+ if module_filename: -+ self._modules[module_filename] = self -+ -+ @classmethod -+ def get_module_source_metadata(cls, module_source, full_line_map=False): -+ source_map = re.search( -+ r"__M_BEGIN_METADATA(.+?)__M_END_METADATA", -+ module_source, re.S).group(1) -+ source_map = compat.json.loads(source_map) -+ source_map['line_map'] = dict((int(k), int(v)) -+ for k, v in source_map['line_map'].items()) -+ if full_line_map: -+ f_line_map = source_map['full_line_map'] = [] -+ line_map = source_map['line_map'] -+ -+ curr_templ_line = 1 -+ for mod_line in range(1, max(line_map)): -+ if mod_line in line_map: -+ curr_templ_line = line_map[mod_line] -+ f_line_map.append(curr_templ_line) -+ return source_map -+ -+ @property -+ def code(self): -+ if self.module_source is not None: -+ return self.module_source -+ else: -+ return util.read_python_file(self.module_filename) -+ -+ @property -+ def source(self): -+ if self.template_source is not None: -+ if self.module._source_encoding and \ -+ not isinstance(self.template_source, compat.text_type): -+ return self.template_source.decode( -+ self.module._source_encoding) -+ else: -+ return self.template_source -+ else: -+ data = util.read_file(self.template_filename) -+ if self.module._source_encoding: -+ return data.decode(self.module._source_encoding) -+ else: -+ return data -+ -+def _compile(template, text, filename, generate_magic_comment): -+ lexer = template.lexer_cls(text, -+ filename, -+ disable_unicode=template.disable_unicode, -+ input_encoding=template.input_encoding, -+ preprocessor=template.preprocessor) -+ node = lexer.parse() -+ source = codegen.compile(node, -+ template.uri, -+ filename, -+ default_filters=template.default_filters, -+ buffer_filters=template.buffer_filters, -+ imports=template.imports, -+ future_imports=template.future_imports, -+ source_encoding=lexer.encoding, -+ generate_magic_comment=generate_magic_comment, -+ disable_unicode=template.disable_unicode, -+ strict_undefined=template.strict_undefined, -+ enable_loop=template.enable_loop, -+ reserved_names=template.reserved_names) -+ return source, lexer -+ -+def _compile_text(template, text, filename): -+ identifier = template.module_id -+ source, lexer = _compile(template, text, filename, -+ generate_magic_comment=template.disable_unicode) -+ -+ cid = identifier -+ if not compat.py3k and isinstance(cid, compat.text_type): -+ cid = cid.encode() -+ module = types.ModuleType(cid) -+ code = compile(source, cid, 'exec') -+ -+ # this exec() works for 2.4->3.3. -+ exec(code, module.__dict__, module.__dict__) -+ return (source, module) -+ -+def _compile_module_file(template, text, filename, outputpath, module_writer): -+ source, lexer = _compile(template, text, filename, -+ generate_magic_comment=True) -+ -+ if isinstance(source, compat.text_type): -+ source = source.encode(lexer.encoding or 'ascii') -+ -+ if module_writer: -+ module_writer(source, outputpath) -+ else: -+ # make tempfiles in the same location as the ultimate -+ # location. this ensures they're on the same filesystem, -+ # avoiding synchronization issues. 
-+ (dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath)) -+ -+ os.write(dest, source) -+ os.close(dest) -+ shutil.move(name, outputpath) -+ -+def _get_module_info_from_callable(callable_): -+ if compat.py3k: -+ return _get_module_info(callable_.__globals__['__name__']) -+ else: -+ return _get_module_info(callable_.func_globals['__name__']) -+ -+def _get_module_info(filename): -+ return ModuleInfo._modules[filename] -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py -new file mode 100644 -index 0000000..cba2ab7 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py -@@ -0,0 +1,360 @@ -+# mako/util.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+import re -+import collections -+import codecs -+import os -+from mako import compat -+import operator -+ -+def update_wrapper(decorated, fn): -+ decorated.__wrapped__ = fn -+ decorated.__name__ = fn.__name__ -+ return decorated -+ -+ -+class PluginLoader(object): -+ def __init__(self, group): -+ self.group = group -+ self.impls = {} -+ -+ def load(self, name): -+ if name in self.impls: -+ return self.impls[name]() -+ else: -+ import pkg_resources -+ for impl in pkg_resources.iter_entry_points( -+ self.group, -+ name): -+ self.impls[name] = impl.load -+ return impl.load() -+ else: -+ from mako import exceptions -+ raise exceptions.RuntimeException( -+ "Can't load plugin %s %s" % -+ (self.group, name)) -+ -+ def register(self, name, modulepath, objname): -+ def load(): -+ mod = __import__(modulepath) -+ for token in modulepath.split(".")[1:]: -+ mod = getattr(mod, token) -+ return getattr(mod, objname) -+ self.impls[name] = load -+ -+def verify_directory(dir): -+ """create and/or verify a filesystem directory.""" -+ -+ tries = 0 -+ -+ while not os.path.exists(dir): -+ try: -+ tries += 1 -+ os.makedirs(dir, compat.octal("0775")) -+ except: -+ if tries > 5: -+ raise -+ -+def to_list(x, default=None): -+ if x is None: -+ return default -+ if not isinstance(x, (list, tuple)): -+ return [x] -+ else: -+ return x -+ -+ -+class memoized_property(object): -+ """A read-only @property that is only evaluated once.""" -+ def __init__(self, fget, doc=None): -+ self.fget = fget -+ self.__doc__ = doc or fget.__doc__ -+ self.__name__ = fget.__name__ -+ -+ def __get__(self, obj, cls): -+ if obj is None: -+ return self -+ obj.__dict__[self.__name__] = result = self.fget(obj) -+ return result -+ -+class memoized_instancemethod(object): -+ """Decorate a method memoize its return value. -+ -+ Best applied to no-arg methods: memoization is not sensitive to -+ argument values, and will always return the same value even when -+ called with different arguments. 
-+ -+ """ -+ def __init__(self, fget, doc=None): -+ self.fget = fget -+ self.__doc__ = doc or fget.__doc__ -+ self.__name__ = fget.__name__ -+ -+ def __get__(self, obj, cls): -+ if obj is None: -+ return self -+ def oneshot(*args, **kw): -+ result = self.fget(obj, *args, **kw) -+ memo = lambda *a, **kw: result -+ memo.__name__ = self.__name__ -+ memo.__doc__ = self.__doc__ -+ obj.__dict__[self.__name__] = memo -+ return result -+ oneshot.__name__ = self.__name__ -+ oneshot.__doc__ = self.__doc__ -+ return oneshot -+ -+class SetLikeDict(dict): -+ """a dictionary that has some setlike methods on it""" -+ def union(self, other): -+ """produce a 'union' of this dict and another (at the key level). -+ -+ values in the second dict take precedence over that of the first""" -+ x = SetLikeDict(**self) -+ x.update(other) -+ return x -+ -+class FastEncodingBuffer(object): -+ """a very rudimentary buffer that is faster than StringIO, -+ but doesn't crash on unicode data like cStringIO.""" -+ -+ def __init__(self, encoding=None, errors='strict', as_unicode=False): -+ self.data = collections.deque() -+ self.encoding = encoding -+ if as_unicode: -+ self.delim = compat.u('') -+ else: -+ self.delim = '' -+ self.as_unicode = as_unicode -+ self.errors = errors -+ self.write = self.data.append -+ -+ def truncate(self): -+ self.data = collections.deque() -+ self.write = self.data.append -+ -+ def getvalue(self): -+ if self.encoding: -+ return self.delim.join(self.data).encode(self.encoding, -+ self.errors) -+ else: -+ return self.delim.join(self.data) -+ -+class LRUCache(dict): -+ """A dictionary-like object that stores a limited number of items, -+ discarding lesser used items periodically. -+ -+ this is a rewrite of LRUCache from Myghty to use a periodic timestamp-based -+ paradigm so that synchronization is not really needed. the size management -+ is inexact. -+ """ -+ -+ class _Item(object): -+ def __init__(self, key, value): -+ self.key = key -+ self.value = value -+ self.timestamp = compat.time_func() -+ def __repr__(self): -+ return repr(self.value) -+ -+ def __init__(self, capacity, threshold=.5): -+ self.capacity = capacity -+ self.threshold = threshold -+ -+ def __getitem__(self, key): -+ item = dict.__getitem__(self, key) -+ item.timestamp = compat.time_func() -+ return item.value -+ -+ def values(self): -+ return [i.value for i in dict.values(self)] -+ -+ def setdefault(self, key, value): -+ if key in self: -+ return self[key] -+ else: -+ self[key] = value -+ return value -+ -+ def __setitem__(self, key, value): -+ item = dict.get(self, key) -+ if item is None: -+ item = self._Item(key, value) -+ dict.__setitem__(self, key, item) -+ else: -+ item.value = value -+ self._manage_size() -+ -+ def _manage_size(self): -+ while len(self) > self.capacity + self.capacity * self.threshold: -+ bytime = sorted(dict.values(self), -+ key=operator.attrgetter('timestamp'), reverse=True) -+ for item in bytime[self.capacity:]: -+ try: -+ del self[item.key] -+ except KeyError: -+ # if we couldn't find a key, most likely some other thread -+ # broke in on us. loop around and try again -+ break -+ -+# Regexp to match python magic encoding line -+_PYTHON_MAGIC_COMMENT_re = re.compile( -+ r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', -+ re.VERBOSE) -+ -+def parse_encoding(fp): -+ """Deduce the encoding of a Python source file (binary mode) from magic -+ comment. -+ -+ It does this in the same way as the `Python interpreter`__ -+ -+ .. 
__: http://docs.python.org/ref/encodings.html -+ -+ The ``fp`` argument should be a seekable file object in binary mode. -+ """ -+ pos = fp.tell() -+ fp.seek(0) -+ try: -+ line1 = fp.readline() -+ has_bom = line1.startswith(codecs.BOM_UTF8) -+ if has_bom: -+ line1 = line1[len(codecs.BOM_UTF8):] -+ -+ m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore')) -+ if not m: -+ try: -+ import parser -+ parser.suite(line1.decode('ascii', 'ignore')) -+ except (ImportError, SyntaxError): -+ # Either it's a real syntax error, in which case the source -+ # is not valid python source, or line2 is a continuation of -+ # line1, in which case we don't want to scan line2 for a magic -+ # comment. -+ pass -+ else: -+ line2 = fp.readline() -+ m = _PYTHON_MAGIC_COMMENT_re.match( -+ line2.decode('ascii', 'ignore')) -+ -+ if has_bom: -+ if m: -+ raise SyntaxError("python refuses to compile code with both a UTF8" \ -+ " byte-order-mark and a magic encoding comment") -+ return 'utf_8' -+ elif m: -+ return m.group(1) -+ else: -+ return None -+ finally: -+ fp.seek(pos) -+ -+def sorted_dict_repr(d): -+ """repr() a dictionary with the keys in order. -+ -+ Used by the lexer unit test to compare parse trees based on strings. -+ -+ """ -+ keys = list(d.keys()) -+ keys.sort() -+ return "{" + ", ".join(["%r: %r" % (k, d[k]) for k in keys]) + "}" -+ -+def restore__ast(_ast): -+ """Attempt to restore the required classes to the _ast module if it -+ appears to be missing them -+ """ -+ if hasattr(_ast, 'AST'): -+ return -+ _ast.PyCF_ONLY_AST = 2 << 9 -+ m = compile("""\ -+def foo(): pass -+class Bar(object): pass -+if False: pass -+baz = 'mako' -+1 + 2 - 3 * 4 / 5 -+6 // 7 % 8 << 9 >> 10 -+11 & 12 ^ 13 | 14 -+15 and 16 or 17 -+-baz + (not +18) - ~17 -+baz and 'foo' or 'bar' -+(mako is baz == baz) is not baz != mako -+mako > baz < mako >= baz <= mako -+mako in baz not in mako""", '', 'exec', _ast.PyCF_ONLY_AST) -+ _ast.Module = type(m) -+ -+ for cls in _ast.Module.__mro__: -+ if cls.__name__ == 'mod': -+ _ast.mod = cls -+ elif cls.__name__ == 'AST': -+ _ast.AST = cls -+ -+ _ast.FunctionDef = type(m.body[0]) -+ _ast.ClassDef = type(m.body[1]) -+ _ast.If = type(m.body[2]) -+ -+ _ast.Name = type(m.body[3].targets[0]) -+ _ast.Store = type(m.body[3].targets[0].ctx) -+ _ast.Str = type(m.body[3].value) -+ -+ _ast.Sub = type(m.body[4].value.op) -+ _ast.Add = type(m.body[4].value.left.op) -+ _ast.Div = type(m.body[4].value.right.op) -+ _ast.Mult = type(m.body[4].value.right.left.op) -+ -+ _ast.RShift = type(m.body[5].value.op) -+ _ast.LShift = type(m.body[5].value.left.op) -+ _ast.Mod = type(m.body[5].value.left.left.op) -+ _ast.FloorDiv = type(m.body[5].value.left.left.left.op) -+ -+ _ast.BitOr = type(m.body[6].value.op) -+ _ast.BitXor = type(m.body[6].value.left.op) -+ _ast.BitAnd = type(m.body[6].value.left.left.op) -+ -+ _ast.Or = type(m.body[7].value.op) -+ _ast.And = type(m.body[7].value.values[0].op) -+ -+ _ast.Invert = type(m.body[8].value.right.op) -+ _ast.Not = type(m.body[8].value.left.right.op) -+ _ast.UAdd = type(m.body[8].value.left.right.operand.op) -+ _ast.USub = type(m.body[8].value.left.left.op) -+ -+ _ast.Or = type(m.body[9].value.op) -+ _ast.And = type(m.body[9].value.values[0].op) -+ -+ _ast.IsNot = type(m.body[10].value.ops[0]) -+ _ast.NotEq = type(m.body[10].value.ops[1]) -+ _ast.Is = type(m.body[10].value.left.ops[0]) -+ _ast.Eq = type(m.body[10].value.left.ops[1]) -+ -+ _ast.Gt = type(m.body[11].value.ops[0]) -+ _ast.Lt = type(m.body[11].value.ops[1]) -+ _ast.GtE = type(m.body[11].value.ops[2]) -+ 
_ast.LtE = type(m.body[11].value.ops[3]) -+ -+ _ast.In = type(m.body[12].value.ops[0]) -+ _ast.NotIn = type(m.body[12].value.ops[1]) -+ -+ -+ -+def read_file(path, mode='rb'): -+ fp = open(path, mode) -+ try: -+ data = fp.read() -+ return data -+ finally: -+ fp.close() -+ -+def read_python_file(path): -+ fp = open(path, "rb") -+ try: -+ encoding = parse_encoding(fp) -+ data = fp.read() -+ if encoding: -+ data = data.decode(encoding) -+ return data -+ finally: -+ fp.close() -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template -new file mode 100644 -index 0000000..5fbba17 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template -@@ -0,0 +1,106 @@ -+/****************************************************************************** -+* -+* Copyright 2015 -+* Intel Corporation -+* -+* Licensed under the Apache License, Version 2.0 (the "License"); -+* you may not use this file except in compliance with the License. -+* You may obtain a copy of the License at -+* -+* http ://www.apache.org/licenses/LICENSE-2.0 -+* -+* Unless required by applicable law or agreed to in writing, software -+* distributed under the License is distributed on an "AS IS" BASIS, -+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+* See the License for the specific language governing permissions and -+* limitations under the License. -+* -+% if gen_header: -+* @file ${filename}.h -+% else: -+* @file ${filename}.cpp -+% endif -+* -+* @brief Dynamic Knobs for Core. -+* -+* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== -+* -+******************************************************************************/ -+%if gen_header: -+#pragma once -+ -+template -+struct Knob -+{ -+ const T& Value() const { return m_Value; } -+ const T& Value(const T& newValue) { m_Value = newValue; return Value(); } -+ -+private: -+ T m_Value; -+}; -+ -+#define DEFINE_KNOB(_name, _type, _default) \\ -+ -+ struct Knob_##_name : Knob<_type> \\ -+ -+ { Knob_##_name() { Value(_default); } \\ -+ -+ const char* Name() const { return "KNOB_" #_name; } \\ -+ -+ } _name; -+ -+#define GET_KNOB(_name) g_GlobalKnobs._name.Value() -+#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue) -+ -+struct GlobalKnobs -+{ -+ % for knob in knobs: -+ //----------------------------------------------------------- -+ // KNOB_${knob[0]} -+ // -+ % for line in knob[1]['desc']: -+ // ${line} -+ % endfor -+ DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']}); -+ -+ % endfor -+ GlobalKnobs(); -+}; -+extern GlobalKnobs g_GlobalKnobs; -+ -+<% -+ max_len = 0 -+ for knob in knobs: -+ if len(knob[0]) > max_len: max_len = len(knob[0]) -+ max_len += len('KNOB_ ') -+ if max_len % 4: max_len += 4 - (max_len % 4) -+ -+ def space_knob(knob): -+ knob_len = len('KNOB_' + knob) -+ return ' '*(max_len - knob_len) -+%> -+% for knob in knobs: -+#define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]}) -+% endfor -+ -+% else: -+% for inc in includes: -+#include <${inc}> -+% endfor -+ -+//======================================================== -+// Static Data Members -+//======================================================== -+GlobalKnobs g_GlobalKnobs; -+ -+//======================================================== -+// Knob Initialization -+//======================================================== -+GlobalKnobs::GlobalKnobs() -+{ -+ % for knob in knobs: -+ 
InitKnob(${knob[0]}); -+ % endfor -+} -+ -+% endif --- -2.6.2 - diff --git a/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch b/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch deleted file mode 100644 index 239130f..0000000 --- a/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch +++ /dev/null @@ -1,42 +0,0 @@ -From fe9e5f557953d3c4b9c3cac6be0ff29d97c3f2c7 Mon Sep 17 00:00:00 2001 -From: Igor Gnatenko -Date: Thu, 22 Oct 2015 17:08:04 +0200 -Subject: [PATCH 3/3] gallium/swr: add flags parameter to - pipe_screen::context_create - -Signed-off-by: Igor Gnatenko ---- - src/gallium/drivers/swr/swr_context.cpp | 3 ++- - src/gallium/drivers/swr/swr_context.h | 2 +- - 2 files changed, 3 insertions(+), 2 deletions(-) - -diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp -index 6269cd0..2dd3443 100644 ---- a/src/gallium/drivers/swr/swr_context.cpp -+++ b/src/gallium/drivers/swr/swr_context.cpp -@@ -336,7 +336,8 @@ swr_render_condition(struct pipe_context *pipe, - - - struct pipe_context * --swr_create_context(struct pipe_screen *screen, void *priv) -+swr_create_context(struct pipe_screen *screen, void *priv, -+ unsigned flags) - { - struct swr_context *ctx = CALLOC_STRUCT(swr_context); - ctx->blendJIT = -diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h -index 9d93a6d..5271eac 100644 ---- a/src/gallium/drivers/swr/swr_context.h -+++ b/src/gallium/drivers/swr/swr_context.h -@@ -160,7 +160,7 @@ swr_context(struct pipe_context *pipe) - return (struct swr_context *)pipe; - } - --struct pipe_context *swr_create_context(struct pipe_screen *, void *priv); -+struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags); - - void swr_state_init(struct pipe_context *pipe); - --- -2.6.2 - diff --git a/mesa.spec b/mesa.spec index 6ca184d..9c81d93 100644 --- a/mesa.spec +++ b/mesa.spec @@ -17,7 +17,6 @@ %define min_wayland_version 1.0 %if 0%{?with_llvm} %define with_radeonsi 1 -%define with_swr 1 %endif %ifarch s390 s390x ppc @@ -75,10 +74,6 @@ Patch15: mesa-9.2-hardware-float.patch Patch20: mesa-10.2-evergreen-big-endian.patch Patch30: mesa-10.3-bigendian-assert.patch -Patch101: 0001-Initial-public-Mesa-SWR.patch -Patch102: 0002-swr-484541-Initial-public-SWR.patch -Patch103: 0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch - # To have sha info in glxinfo BuildRequires: git-core @@ -353,10 +348,6 @@ grep -q ^/ src/gallium/auxiliary/vl/vl_decoder.c && exit 1 %patch20 -p1 -b .egbe %patch30 -p1 -b .beassert -%patch101 -p1 -%patch102 -p1 -%patch103 -p1 - %if 0%{with_private_llvm} sed -i 's/llvm-config/mesa-private-llvm-config-%{__isa_bits}/g' configure.ac sed -i 's/`$LLVM_CONFIG --version`/&-mesa/' configure.ac @@ -404,8 +395,7 @@ export CXXFLAGS="$RPM_OPT_FLAGS %{?with_opencl:-frtti -fexceptions} %{!?with_ope %if %{with_hardware} %{?with_xa:--enable-xa} \ %{?with_nine:--enable-nine} \ - --with-gallium-drivers=%{?with_vmware:svga,}%{?with_radeonsi:radeonsi,}%{?with_llvm:swrast,r600,}%{?with_freedreno:freedreno,}%{?with_vc4:vc4,}%{?with_ilo:ilo,}%{?with_swr:swr,}r300,nouveau \ - %{?with_swr:--enable-swr-native} \ + --with-gallium-drivers=%{?with_vmware:svga,}%{?with_radeonsi:radeonsi,}%{?with_llvm:swrast,r600,}%{?with_freedreno:freedreno,}%{?with_vc4:vc4,}%{?with_ilo:ilo,}r300,nouveau \ %else --with-gallium-drivers=%{?with_llvm:swrast} \ %endif @@ -687,6 +677,7 @@ rm -rf $RPM_BUILD_ROOT %changelog * Thu Oct 22 2015 Igor Gnatenko - 
11.1.0-0.devel.10.7182498 - 7182498 +- Disable SWR rasterizer * Wed Oct 21 2015 Igor Gnatenko - 11.1.0-0.devel.9.4a168ad - Enable experimental SWR rasterizer